Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
49 files changed, 2581 insertions, 1790 deletions
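The first file in the diff below, common/containers.hpp, is almost entirely a re-indentation of SWRL::UncheckedFixedVector, a fixed-capacity vector with no bounds checking beyond SWR_ASSERT in debug builds; the only functional change is the mSize{ 0 } member initializer. The std::hash specialization at the end of that file combines element hashes with the boost-style recipe x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2). As a reading aid, here is a minimal standalone sketch of the container pattern (hypothetical names, not the project's code):

#include <cassert>
#include <cstddef>

// Sketch of a fixed-capacity, unchecked vector in the spirit of
// SWRL::UncheckedFixedVector. Storage is inline, size() can never
// exceed N, and push_back past capacity is undefined behavior,
// which is why the real code pairs operations with SWR_ASSERT.
template <typename T, std::size_t N>
struct FixedVecSketch
{
    void push_back(T const& t) { assert(mSize < N); mElements[mSize++] = t; }
    void pop_back()            { assert(mSize > 0); --mSize; }
    T&       operator[](std::size_t i)       { return mElements[i]; }
    T const& operator[](std::size_t i) const { return mElements[i]; }
    std::size_t size() const { return mSize; }

private:
    std::size_t mSize = 0;
    T mElements[N];
};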
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp index bc96c5f62fd..f3c05979144 100644 --- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp @@ -33,137 +33,137 @@ namespace SWRL template <typename T, int NUM_ELEMENTS> struct UncheckedFixedVector { - UncheckedFixedVector() : mSize(0) - { - } - - UncheckedFixedVector(std::size_t size, T const& exemplar) - { - this->mSize = 0; - for (std::size_t i = 0; i < size; ++i) - this->push_back(exemplar); - } - - template <typename Iter> - UncheckedFixedVector(Iter fst, Iter lst) - { - this->mSize = 0; - for ( ; fst != lst; ++fst) - this->push_back(*fst); - } - - UncheckedFixedVector(UncheckedFixedVector const& UFV) - { - this->mSize = 0; - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - } - - UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) - { - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - return *this; - } - - T* begin() { return &this->mElements[0]; } - T* end() { return &this->mElements[0] + this->mSize; } - T const* begin() const { return &this->mElements[0]; } - T const* end() const { return &this->mElements[0] + this->mSize; } - - friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return false; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return false; - } - return true; - } - - friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return true; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return true; - } - return false; - } - - T& operator[](std::size_t idx) - { - return this->mElements[idx]; - } - T const& operator[](std::size_t idx) const - { - return this->mElements[idx]; - } - void push_back(T const& t) - { - this->mElements[this->mSize] = t; - ++this->mSize; - } - void pop_back() - { - SWR_ASSERT(this->mSize > 0); - --this->mSize; - } - T& back() - { - return this->mElements[this->mSize-1]; - } - T const& back() const - { - return this->mElements[this->mSize-1]; - } - bool empty() const - { - return this->mSize == 0; - } - std::size_t size() const - { - return this->mSize; - } - void resize(std::size_t sz) - { - this->mSize = sz; - } - void clear() - { - this->resize(0); - } + UncheckedFixedVector() : mSize(0) + { + } + + UncheckedFixedVector(std::size_t size, T const& exemplar) + { + this->mSize = 0; + for (std::size_t i = 0; i < size; ++i) + this->push_back(exemplar); + } + + template <typename Iter> + UncheckedFixedVector(Iter fst, Iter lst) + { + this->mSize = 0; + for ( ; fst != lst; ++fst) + this->push_back(*fst); + } + + UncheckedFixedVector(UncheckedFixedVector const& UFV) + { + this->mSize = 0; + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + } + + UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) + { + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + return *this; + } + + T* begin() { return &this->mElements[0]; } + T* end() { return &this->mElements[0] + this->mSize; } + T const* begin() const { return &this->mElements[0]; } + T const* end() const { return &this->mElements[0] + this->mSize; } + + friend bool 
operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return false; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return false; + } + return true; + } + + friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return true; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return true; + } + return false; + } + + T& operator[](std::size_t idx) + { + return this->mElements[idx]; + } + T const& operator[](std::size_t idx) const + { + return this->mElements[idx]; + } + void push_back(T const& t) + { + this->mElements[this->mSize] = t; + ++this->mSize; + } + void pop_back() + { + SWR_ASSERT(this->mSize > 0); + --this->mSize; + } + T& back() + { + return this->mElements[this->mSize-1]; + } + T const& back() const + { + return this->mElements[this->mSize-1]; + } + bool empty() const + { + return this->mSize == 0; + } + std::size_t size() const + { + return this->mSize; + } + void resize(std::size_t sz) + { + this->mSize = sz; + } + void clear() + { + this->resize(0); + } private: - std::size_t mSize; - T mElements[NUM_ELEMENTS]; + std::size_t mSize{ 0 }; + T mElements[NUM_ELEMENTS]; }; template <typename T, int NUM_ELEMENTS> struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> { - FixedStack() {} - - void push(T const& t) - { - this->push_back(t); - } - - void pop() - { - this->pop_back(); - } - - T& top() - { - return this->back(); - } - - T const& top() const - { - return this->back(); - } + FixedStack() {} + + void push(T const& t) + { + this->push_back(t); + } + + void pop() + { + this->pop_back(); + } + + T& top() + { + return this->back(); + } + + T const& top() const + { + return this->back(); + } }; template <typename T> @@ -190,16 +190,16 @@ namespace std template <typename T, int N> struct hash<SWRL::UncheckedFixedVector<T, N>> { - size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const - { - if (v.size() == 0) return 0; - std::hash<T> H; - size_t x = H(v[0]); - if (v.size() == 1) return x; - for (size_t i = 1; i < v.size(); ++i) - x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); - return x; - } + size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const + { + if (v.size() == 0) return 0; + std::hash<T> H; + size_t x = H(v[0]); + if (v.size() == 1) return x; + for (size_t i = 1; i < v.size(); ++i) + x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); + return x; + } }; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 522ae0dd65f..5794f3f625a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -47,16 +47,18 @@ #define DEBUGBREAK __debugbreak() #define PRAGMA_WARNING_PUSH_DISABLE(...) 
\ - __pragma(warning(push));\ - __pragma(warning(disable:__VA_ARGS__)); + __pragma(warning(push));\ + __pragma(warning(disable:__VA_ARGS__)); #define PRAGMA_WARNING_POP() __pragma(warning(pop)) #if defined(_WIN32) #if defined(_WIN64) +#define BitScanReverseSizeT BitScanReverse64 #define BitScanForwardSizeT BitScanForward64 #define _mm_popcount_sizeT _mm_popcnt_u64 #else +#define BitScanReverseSizeT BitScanReverse #define BitScanForwardSizeT BitScanForward #define _mm_popcount_sizeT _mm_popcnt_u32 #endif @@ -68,29 +70,20 @@ #include <stdlib.h> #include <string.h> -#include <X11/Xmd.h> #include <x86intrin.h> #include <stdint.h> #include <sys/types.h> #include <unistd.h> #include <sys/stat.h> +#include <stdio.h> -typedef void VOID; +typedef void VOID; typedef void* LPVOID; -typedef CARD8 BOOL; -typedef wchar_t WCHAR; -typedef uint16_t UINT16; -typedef int INT; -typedef unsigned int UINT; -typedef uint32_t UINT32; -typedef uint64_t UINT64; -typedef int64_t INT64; -typedef void* HANDLE; -typedef float FLOAT; -typedef int LONG; -typedef CARD8 BYTE; -typedef unsigned char UCHAR; -typedef unsigned int DWORD; +typedef int INT; +typedef unsigned int UINT; +typedef void* HANDLE; +typedef int LONG; +typedef unsigned int DWORD; #undef FALSE #define FALSE 0 @@ -104,8 +97,11 @@ typedef unsigned int DWORD; #define INLINE __inline #endif #define DEBUGBREAK asm ("int $3") +#if !defined(__CYGWIN__) #define __cdecl +#define __stdcall #define __declspec(X) +#endif #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ @@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b) #define CreateDirectory(name, pSecurity) mkdir(name, 0777) -#if defined(_WIN32) -static inline -unsigned int _mm_popcnt_u32(unsigned int v) -{ - return __builtin_popcount(v); -} -#endif - #define _aligned_free free #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) +#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1) #define _ReadWriteBarrier() asm volatile("" ::: "memory") -#define __stdcall #define PRAGMA_WARNING_PUSH_DISABLE(...) 
#define PRAGMA_WARNING_POP() @@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v) #endif // Universal types -typedef BYTE KILOBYTE[1024]; +typedef uint8_t KILOBYTE[1024]; typedef KILOBYTE MEGABYTE[1024]; typedef MEGABYTE GIGABYTE[1024]; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp index 454641b2751..c6768b4c566 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name) UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) { + mThreadMutex.lock(); size_t id = mBuckets.size(); mBuckets.push_back(desc); + mThreadMutex.unlock(); return (UINT)id; } -void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) +void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) { const char *arrows[] = { "", @@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); // compute average cycle count per invocation - UINT64 CPE = bucket.elapsed / bucket.count; + uint64_t CPE = bucket.elapsed / bucket.count; BUCKET_DESC &desc = mBuckets[bucket.id]; @@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) // compute thread level total cycle counts across all buckets from root const BUCKET& root = thread.root; - UINT64 totalCycles = 0; + uint64_t totalCycles = 0; for (const BUCKET& child : root.children) { totalCycles += child.elapsed; @@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename) fclose(f); } } + +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StartBucket(id); +} + +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StopBucket(id); +} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h index 99cb10ec6e8..9dfa7f694d0 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -70,7 +70,9 @@ public: // removes all registered buckets void ClearBuckets() { + mThreadMutex.lock(); mBuckets.clear(); + mThreadMutex.unlock(); } /// Registers a new thread with the manager. 
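The rdtsc_buckets hunks above add explicit mThreadMutex.lock()/unlock() pairs around mutations of the mBuckets vector in RegisterBucket() and ClearBuckets(). A sketch of the same guard in RAII form (not the project's code; BucketDesc stands in for BUCKET_DESC), which also releases the mutex if push_back throws:

#include <cstdint>
#include <mutex>
#include <vector>

struct BucketDesc { /* stand-in for BUCKET_DESC */ };

// RAII variant of the locking added to BucketManager::RegisterBucket:
// std::lock_guard unlocks on every exit path, including exceptions,
// unlike the raw lock()/unlock() pair in the hunk above.
uint32_t RegisterBucketSketch(std::mutex& mtx,
                              std::vector<BucketDesc>& buckets,
                              const BucketDesc& desc)
{
    std::lock_guard<std::mutex> lock(mtx);
    uint32_t id = static_cast<uint32_t>(buckets.size());
    buckets.push_back(desc);
    return id;
}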
@@ -209,7 +211,7 @@ public: } private: - void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket); + void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); void PrintThread(FILE* f, const BUCKET_THREAD& thread); // list of active threads that have registered with this manager @@ -227,3 +229,8 @@ private: bool mThreadViz{ false }; std::string mThreadVizDir; }; + + +// C helpers for jitter +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id); +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id); diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h index 41c6d5dec79..34c322e5a85 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -64,13 +64,13 @@ struct BUCKET_THREAD std::string name; // id for this thread, assigned by the thread manager - uint32_t id; + uint32_t id{ 0 }; // root of the bucket hierarchy for this thread BUCKET root; // currently executing bucket somewhere in the hierarchy - BUCKET* pCurrent; + BUCKET* pCurrent{ nullptr }; // currently executing hierarchy level uint32_t level{ 0 }; diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8fa6d9ef408..fa792b42e1a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -43,14 +43,14 @@ typedef uint8_t simdmask; // simd vector OSALIGNSIMD(union) simdvector { - simdscalar v[4]; - struct - { - simdscalar x, y, z, w; - }; - - simdscalar& operator[] (const int i) { return v[i]; } - const simdscalar& operator[] (const int i) const { return v[i]; } + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } }; #if KNOB_SIMD_WIDTH == 8 @@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector #define _simd_load1_ps _mm256_broadcast_ss #define _simd_loadu_ps _mm256_loadu_ps #define _simd_setzero_ps _mm256_setzero_ps -#define _simd_set1_ps _mm256_set1_ps -#define _simd_blend_ps _mm256_blend_ps +#define _simd_set1_ps _mm256_set1_ps +#define _simd_blend_ps _mm256_blend_ps #define _simd_blendv_ps _mm256_blendv_ps #define _simd_store_ps _mm256_store_ps #define _simd_mul_ps _mm256_mul_ps @@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector INLINE \ __m256i func(__m256i a, __m256i b)\ {\ - __m128i aHi = _mm256_extractf128_si256(a, 1);\ - __m128i bHi = _mm256_extractf128_si256(b, 1);\ - __m128i aLo = _mm256_castsi256_si128(a);\ - __m128i bLo = _mm256_castsi256_si128(b);\ + __m128i aHi = _mm256_extractf128_si256(a, 1);\ + __m128i bHi = _mm256_extractf128_si256(b, 1);\ + __m128i aLo = _mm256_castsi256_si128(a);\ + __m128i bLo = _mm256_castsi256_si128(b);\ \ - __m128i subLo = intrin(aLo, bLo);\ - __m128i subHi = intrin(aHi, bHi);\ + __m128i subLo = intrin(aLo, bLo);\ + __m128i subHi = intrin(aHi, bHi);\ \ - __m256i result = _mm256_castsi128_si256(subLo);\ - result = _mm256_insertf128_si256(result, subHi, 1);\ + __m256i result = _mm256_castsi128_si256(subLo);\ + result = _mm256_insertf128_si256(result, subHi, 1);\ \ - return result;\ + return result;\ } #if (KNOB_ARCH == KNOB_ARCH_AVX) +INLINE +__m256 _simdemu_permute_ps(__m256 a, __m256i b) +{ + __m128 aHi = 
_mm256_extractf128_ps(a, 1); + __m128i bHi = _mm256_extractf128_si256(b, 1); + __m128 aLo = _mm256_castps256_ps128(a); + __m128i bLo = _mm256_castsi256_si128(b); + + __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3)); + __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi)); + + indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3)); + resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi)); + + __m256 result = _mm256_castps128_ps256(blendLowRes); + result = _mm256_insertf128_ps(result, blendHiRes, 1); + + return result; +} + +INLINE +__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = _mm_extract_epi32(vCountLow, 3); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + + +INLINE +__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi <<= countHi; 
+ vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = _mm_extract_epi32(vCountLow, 3); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + #define _simd_mul_epi32 _simdemu_mul_epi32 #define _simd_mullo_epi32 _simdemu_mullo_epi32 #define _simd_sub_epi32 _simdemu_sub_epi32 @@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\ #define _simd_add_epi8 _simdemu_add_epi8 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8 +#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8 +#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16 +#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16 #define _simd_movemask_epi8 _simdemu_movemask_epi8 +#define _simd_permute_ps _simdemu_permute_ps +#define _simd_srlv_epi32 _simdemu_srlv_epi32 +#define _simd_sllv_epi32 _simdemu_sllv_epi32 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) @@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) +SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8) +SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) +SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16) #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) @@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) INLINE __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) { - __m128 res = _mm_mul_ps(a, b); - res = _mm_add_ps(res, c); - return res; + __m128 res = _mm_mul_ps(a, b); + res = _mm_add_ps(res, c); + return res; } INLINE __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_add_ps(res, c); - return res; + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_add_ps(res, c); + return res; } INLINE __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_sub_ps(res, c); - return res; + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_sub_ps(res, c); + return res; } INLINE @@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a) #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8 +#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8 +#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16 +#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16 #define _simd_movemask_epi8 _mm256_movemask_epi8 +#define 
_simd_permute_ps _mm256_permutevar8x32_ps +#define _simd_srlv_epi32 _mm256_srlv_epi32 +#define _simd_sllv_epi32 _mm256_sllv_epi32 #endif #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) @@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = _mm_slli_epi32(aHi, i); - __m128i resLo = _mm_slli_epi32(aLo, i); + __m128i resHi = _mm_slli_epi32(aHi, i); + __m128i resLo = _mm_slli_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = _mm_srai_epi32(aHi, i); - __m128i resLo = _mm_srai_epi32(aLo, i); + __m128i resHi = _mm_srai_epi32(aHi, i); + __m128i resLo = _mm_srai_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) @@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) INLINE void _simdvec_transpose(simdvector &v) { - SWR_ASSERT(false, "Need to implement 8 wide version"); + SWR_ASSERT(false, "Need to implement 8 wide version"); } #else @@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v) INLINE void _simdvec_load_ps(simdvector& r, const float *p) { - r[0] = _simd_set1_ps(p[0]); - r[1] = _simd_set1_ps(p[1]); - r[2] = _simd_set1_ps(p[2]); - r[3] = _simd_set1_ps(p[3]); + r[0] = _simd_set1_ps(p[0]); + r[1] = _simd_set1_ps(p[1]); + r[2] = _simd_set1_ps(p[2]); + r[3] = _simd_set1_ps(p[3]); } INLINE void _simdvec_mov(simdvector& r, const simdscalar& s) { - r[0] = s; - r[1] = s; - r[2] = s; - r[3] = s; + r[0] = s; + r[1] = s; + r[2] = s; + r[3] = s; } INLINE void _simdvec_mov(simdvector& r, const simdvector& v) { - r[0] = v[0]; - r[1] = v[1]; - r[2] = v[2]; - r[3] = v[3]; + r[0] = v[0]; + r[1] = v[1]; + r[2] = v[2]; + r[3] = v[3]; } // just move a lane from the source simdvector to dest simdvector INLINE void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) { - _simd_mov(r[0], rlane, s[0], slane); - _simd_mov(r[1], rlane, s[1], slane); - _simd_mov(r[2], rlane, s[2], slane); - _simd_mov(r[3], rlane, s[3], slane); + _simd_mov(r[0], rlane, s[0], slane); + _simd_mov(r[1], rlane, s[1], slane); + _simd_mov(r[2], rlane, s[2], slane); + _simd_mov(r[3], rlane, s[3], slane); } INLINE void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // 
(v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE simdscalar _simdvec_rcp_length_ps(const simdvector& v) { - simdscalar length; - _simdvec_dp4_ps(length, v, v); - return _simd_rsqrt_ps(length); + simdscalar length; + _simdvec_dp4_ps(length, v, v); + return _simd_rsqrt_ps(length); } INLINE void _simdvec_normalize_ps(simdvector& r, const simdvector& v) { - simdscalar vecLength; - vecLength = _simdvec_rcp_length_ps(v); + simdscalar vecLength; + vecLength = _simdvec_rcp_length_ps(v); - r[0] = _simd_mul_ps(v[0], vecLength); - r[1] = _simd_mul_ps(v[1], vecLength); - r[2] = _simd_mul_ps(v[2], vecLength); - r[3] = _simd_mul_ps(v[3], vecLength); + r[0] = _simd_mul_ps(v[0], vecLength); + r[1] = _simd_mul_ps(v[1], vecLength); + r[2] = _simd_mul_ps(v[2], vecLength); + r[3] = _simd_mul_ps(v[3], vecLength); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) { - r[0] = _simd_mul_ps(v[0], s); - r[1] = _simd_mul_ps(v[1], s); - r[2] = _simd_mul_ps(v[2], s); - r[3] = _simd_mul_ps(v[3], s); + r[0] = _simd_mul_ps(v[0], s); + r[1] = _simd_mul_ps(v[1], s); + r[2] = _simd_mul_ps(v[2], s); + r[3] = _simd_mul_ps(v[3], s); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_mul_ps(v0[0], v1[0]); - r[1] = _simd_mul_ps(v0[1], v1[1]); - r[2] = _simd_mul_ps(v0[2], v1[2]); - r[3] = _simd_mul_ps(v0[3], v1[3]); + r[0] = _simd_mul_ps(v0[0], v1[0]); + r[1] = _simd_mul_ps(v0[1], v1[1]); + r[2] = _simd_mul_ps(v0[2], v1[2]); + r[3] = _simd_mul_ps(v0[3], v1[3]); } INLINE void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_add_ps(v0[0], v1[0]); - r[1] = _simd_add_ps(v0[1], v1[1]); - r[2] = _simd_add_ps(v0[2], v1[2]); - r[3] = _simd_add_ps(v0[3], v1[3]); + r[0] = _simd_add_ps(v0[0], v1[0]); + r[1] = _simd_add_ps(v0[1], v1[1]); + r[2] = _simd_add_ps(v0[2], v1[2]); + r[3] = _simd_add_ps(v0[3], v1[3]); } INLINE void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) { - r[0] = _simd_min_ps(v0[0], s); - r[1] = _simd_min_ps(v0[1], s); - r[2] = _simd_min_ps(v0[2], s); - r[3] = _simd_min_ps(v0[3], s); + r[0] = _simd_min_ps(v0[0], s); + r[1] = _simd_min_ps(v0[1], s); + r[2] = _simd_min_ps(v0[2], s); + r[3] = _simd_min_ps(v0[3], s); } INLINE void _simdvec_max_ps(simdvector& r, const simdvector& v0, const 
simdscalar& s) { - r[0] = _simd_max_ps(v0[0], s); - r[1] = _simd_max_ps(v0[1], s); - r[2] = _simd_max_ps(v0[2], s); - r[3] = _simd_max_ps(v0[3], s); + r[0] = _simd_max_ps(v0[0], s); + r[1] = _simd_max_ps(v0[1], s); + r[2] = _simd_max_ps(v0[2], s); + r[3] = _simd_max_ps(v0[3], s); } // Matrix4x4 * Vector4 @@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) INLINE void _simd_mat4x4_vec4_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[3] = r0; + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = 
_simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[3] = r0; } // Matrix4x4 * Vector3 - Direction Vector where w = 0. 
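_simd_mat4x4_vec4_multiply above is fully unrolled, but every row repeats one pattern: broadcast a single matrix element to all SIMD lanes with _simd_load1_ps, multiply by the matching vector component, and accumulate. The same computation in loop form (a sketch that relies on the simdvector type and _simd_* wrappers defined in this header):

// Loop form of the unrolled row/column pattern in _simd_mat4x4_vec4_multiply.
// pMatrix is a row-major 4x4; v carries x/y/z/w, each a full SIMD register.
INLINE void mat4x4_vec4_multiply_sketch(simdvector& result,
                                        const float* pMatrix,
                                        const simdvector& v)
{
    for (int row = 0; row < 4; ++row)
    {
        // r = m[row][0] * v.x
        simdscalar r = _simd_mul_ps(_simd_load1_ps(pMatrix + row * 4 + 0), v[0]);
        for (int col = 1; col < 4; ++col)
        {
            // r += m[row][col] * v[col]
            simdscalar t = _simd_mul_ps(_simd_load1_ps(pMatrix + row * 4 + col), v[col]);
            r = _simd_add_ps(r, t);
        }
        result[row] = r;
    }
}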
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) INLINE void _simd_mat3x3_vec3_w0_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[2] = r0; - - result[3] = _simd_setzero_ps(); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[2] = r0; + + result[3] = _simd_setzero_ps(); } // Matrix4x4 * Vector3 - Position vector where w = 1. 
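Earlier in this file, the AVX-only path (KNOB_ARCH == KNOB_ARCH_AVX) added _simdemu_srlv_epi32 and _simdemu_sllv_epi32, which emulate the AVX2 per-lane variable shifts by extracting all eight 32-bit lanes, shifting them in scalar code, and re-inserting them. For reference, the per-lane semantics being emulated are sketched below; note, as a caution, that _mm256_srlv_epi32 is a logical shift that yields 0 for counts of 32 or more, while the emulation above shifts a signed int32_t:

#include <cstdint>

// Per-lane reference for the AVX2 variable shifts emulated above.
// AVX2 defines the result to be 0 whenever the shift count is >= 32.
static void srlv_reference(uint32_t out[8], const uint32_t a[8], const uint32_t n[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = (n[i] < 32) ? (a[i] >> n[i]) : 0u;
}

static void sllv_reference(uint32_t out[8], const uint32_t a[8], const uint32_t n[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = (n[i] < 32) ? (a[i] << n[i]) : 0u;
}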
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) INLINE void _simd_mat4x4_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 
1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) } INLINE void _simd_mat4x3_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - result[3] = _simd_set1_ps(1.0f); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // 
(m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + result[3] = _simd_set1_ps(1.0f); } ////////////////////////////////////////////////////////////////////////// @@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons return vplaneps(vA, vB, vC, vI, vJ); } +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. clear lowest bit + mask &= ~lowest; + } + return result; +#endif +} #endif//__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index fccccab503c..f0f7956b590 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext); /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. 
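The pdep_u32/pext_u32 helpers added to simdintrin.h above use the hardware BMI2 instructions on AVX2 builds and otherwise fall back to a bit-scan loop (credited in the code to wm.ite.pl's pdep software emulation). A small self-contained restatement with a worked example, using hypothetical values:

#include <cassert>
#include <cstdint>

// Scalar restatement of the fallback loop above: pdep scatters the low
// bits of 'a' into the set-bit positions of 'mask', from low to high.
static uint32_t pdep32(uint32_t a, uint32_t mask)
{
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1)
    {
        const uint32_t lowest = mask & (0u - mask); // isolate lowest set mask bit
        if (a & bit)
            result |= lowest;                       // deposit the next source bit
        mask &= mask - 1;                           // clear that mask bit
    }
    return result;
}

int main()
{
    // Deposit the bits 1,0,1,1 into the four positions selected by 0xF0.
    assert(pdep32(0x0Bu, 0xF0u) == 0xB0u);
    return 0;
}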
HANDLE SwrCreateContext( - const SWR_CREATECONTEXT_INFO* pCreateInfo) + SWR_CREATECONTEXT_INFO* pCreateInfo) { RDTSC_RESET(); RDTSC_INIT(0); @@ -61,27 +61,16 @@ HANDLE SwrCreateContext( pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; - pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->numSubContexts = pCreateInfo->maxSubContexts; - if (pContext->numSubContexts > 1) - { - pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64); - memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts); - } + pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { - pContext->dcRing[dc].pArena = new Arena(); - pContext->dcRing[dc].inUse = false; + pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. - pContext->dsRing[dc].pArena = new Arena(); + pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } if (!KNOB_SINGLE_THREADED) @@ -108,9 +97,6 @@ HANDLE SwrCreateContext( pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); } - pContext->nextDrawId = 1; - pContext->DrawEnqueued = 1; - // State setup AFTER context is fully initialized SetupDefaultState(pContext); @@ -125,6 +111,13 @@ HANDLE SwrCreateContext( pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; pContext->pfnClearTile = pCreateInfo->pfnClearTile; + // pass pointer to bucket manager back to caller +#ifdef KNOB_ENABLE_RDTSC + pCreateInfo->pBucketMgr = &gBucketMgr; +#endif + + pCreateInfo->contextSaveSize = sizeof(API_STATE); + return (HANDLE)pContext; } @@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext) _aligned_free(pContext->pScratch[i]); } - _aligned_free(pContext->dcRing); - _aligned_free(pContext->dsRing); - _aligned_free(pContext->subCtxSave); - delete(pContext->pHotTileMgr); pContext->~SWR_CONTEXT(); @@ -168,49 +157,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext) pContext->FifosNotEmpty.notify_all(); } -bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) -{ - // For single thread nothing should still be drawing. - if (KNOB_SINGLE_THREADED) { return false; } - - if (pDC->isCompute) - { - if (pDC->doneCompute) - { - pDC->inUse = false; - return false; - } - } - - // Check if backend work is done. First make sure all triangles have been binned. - if (pDC->doneFE == true) - { - // ensure workers have all moved passed this draw - if (pDC->threadsDoneFE != pContext->NumWorkerThreads) - { - return true; - } - - if (pDC->threadsDoneBE != pContext->NumWorkerThreads) - { - return true; - } - - pDC->inUse = false; // all work is done. 
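In the QueueWork template that this hunk continues into below, each draw context's threadsDone counter is primed with NumWorkerThreads * 2 because every worker visits a DC twice, once for front-end and once for back-end work; the single-threaded path primes it with 2 and drains it by spinning on CompleteDrawContext(). CompleteDrawContext itself is not shown in this diff, so the following countdown is an assumption about its shape, sketched with an atomic:

#include <atomic>
#include <cstdint>

struct DrawContextSketch
{
    // Primed to numWorkers * 2: one decrement per worker per pass (FE, BE).
    std::atomic<int32_t> threadsDone{ 0 };
};

// Hypothetical stand-in for CompleteDrawContext(): a worker calls this after
// finishing its look at the DC for one pass. Whoever drives the counter to
// zero knows all workers are past the DC, so its arenas can be recycled and
// its ring slot dequeued. Returns the number of passes still outstanding.
static int32_t CompletePass(DrawContextSketch& dc)
{
    int32_t remaining = dc.threadsDone.fetch_sub(1, std::memory_order_acq_rel) - 1;
    if (remaining == 0)
    {
        // ... release arenas, dequeue the dcRing slot ...
    }
    return remaining;
}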
- } - - return pDC->inUse; -} - -void QueueDraw(SWR_CONTEXT *pContext) +template<bool IsDraw> +void QueueWork(SWR_CONTEXT *pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; + // Each worker thread looks at a DC for both FE and BE work at different times and so we + // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers + // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and + // then moved on if all work is done.) + pContext->pCurDrawContext->threadsDone = + pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; _ReadWriteBarrier(); { std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; + pContext->dcRing.Enqueue(); } if (KNOB_SINGLE_THREADED) @@ -219,10 +179,21 @@ void QueueDraw(SWR_CONTEXT *pContext) uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - std::unordered_set<uint32_t> lockedTiles; - uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); - WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + if (IsDraw) + { + static TileSet lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); + } + else + { + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + } + + // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). + while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} // restore csr _mm_setcsr(mxcsr); @@ -239,40 +210,14 @@ void QueueDraw(SWR_CONTEXT *pContext) pContext->pCurDrawContext = nullptr; } -///@todo Combine this with QueueDraw -void QueueDispatch(SWR_CONTEXT *pContext) +INLINE void QueueDraw(SWR_CONTEXT* pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; - - _ReadWriteBarrier(); - { - std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; - } - - if (KNOB_SINGLE_THREADED) - { - // flush denormals to 0 - uint32_t mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - - uint64_t curDispatch = pContext->pCurDrawContext->drawId; - WorkOnCompute(pContext, 0, curDispatch); - - // restore csr - _mm_setcsr(mxcsr); - } - else - { - RDTSC_START(APIDrawWakeAllThreads); - WakeAllThreads(pContext); - RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); - } + QueueWork<true>(pContext); +} - // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. - pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; +INLINE void QueueDispatch(SWR_CONTEXT* pContext) +{ + QueueWork<false>(pContext); } DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) @@ -281,23 +226,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If current draw context is null then need to obtain a new draw context to use from ring. 
if (pContext->pCurDrawContext == nullptr) { - uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; - - DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; - - // Need to wait until this draw context is available to use. - while (StillDrawing(pContext, pCurDrawContext)) + // Need to wait for a free entry. + while (pContext->dcRing.IsFull()) { _mm_pause(); } + uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + // Assign next available entry in DS ring to this DC. uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; - Arena& stateArena = *(pCurDrawContext->pState->pArena); - // Copy previous state to current state. if (pContext->pPrevDrawContext) { @@ -310,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) { CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); - stateArena.Reset(true); // Reset memory. + // Should have been cleaned up previously + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); + pCurDrawContext->pState->pPrivateState = nullptr; pContext->curStateId++; // Progress state ring index forward. @@ -320,30 +265,31 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If its a split draw then just copy the state pointer over // since its the same draw. pCurDrawContext->pState = pPrevDrawContext->pState; + SWR_ASSERT(pPrevDrawContext->cleanupState == false); } } else { - stateArena.Reset(); // Reset memory. + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); pContext->curStateId++; // Progress state ring index forward. } + SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); + pCurDrawContext->dependency = 0; - pCurDrawContext->pArena->Reset(); pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
- pCurDrawContext->inUse = false; - pCurDrawContext->doneCompute = false; pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDoneFE = 0; - pCurDrawContext->threadsDoneBE = 0; + pCurDrawContext->threadsDone = 0; pCurDrawContext->pTileMgr->initialize(); // Assign unique drawId for this DC - pCurDrawContext->drawId = pContext->nextDrawId++; + pCurDrawContext->drawId = pContext->dcRing.GetHead(); + + pCurDrawContext->cleanupState = true; } else { @@ -354,38 +300,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) return pContext->pCurDrawContext; } -void SWR_API SwrSetActiveSubContext( - HANDLE hContext, - uint32_t subContextIndex) +API_STATE* GetDrawState(SWR_CONTEXT *pContext) { - SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; - if (subContextIndex >= pContext->numSubContexts) - { - return; - } + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_ASSERT(pDC->pState != nullptr); - if (subContextIndex != pContext->curSubCtxId) - { - // Save and restore draw state - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - CopyState( - pContext->subCtxSave[pContext->curSubCtxId], - *(pDC->pState)); + return &pDC->pState->state; +} - CopyState( - *(pDC->pState), - pContext->subCtxSave[subContextIndex]); +void SWR_API SwrSaveState( + HANDLE hContext, + void* pOutputStateBlock, + size_t memSize) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pSrc = GetDrawState(pContext); + SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); - pContext->curSubCtxId = subContextIndex; - } + memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); } -API_STATE* GetDrawState(SWR_CONTEXT *pContext) +void SWR_API SwrRestoreState( + HANDLE hContext, + const void* pStateBlock, + size_t memSize) { - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SWR_ASSERT(pDC->pState != nullptr); + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pDst = GetDrawState(pContext); + SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); - return &pDC->pState->state; + memcpy(pDst, pStateBlock, sizeof(*pDst)); } void SetupDefaultState(SWR_CONTEXT *pContext) @@ -431,16 +375,12 @@ void SwrWaitForIdle(HANDLE hContext) SWR_CONTEXT *pContext = GetContext(hContext); RDTSC_START(APIWaitForIdle); - // Wait for all work to complete. 
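
SwrSaveState and SwrRestoreState replace the fixed sub-context array with caller-owned blobs, so drivers size and manage the storage themselves. A hypothetical caller-side sketch (createInfo, the malloc/free pairing, and the surrounding driver code are assumptions, not part of the patch):

// 'contextSaveSize' is an output filled in by SwrCreateContext (see api.h below).
SWR_CREATECONTEXT_INFO createInfo = {};
HANDLE hContext = SwrCreateContext(&createInfo);

void* pSaved = malloc(createInfo.contextSaveSize);
SwrSaveState(hContext, pSaved, createInfo.contextSaveSize);

// ... bind temporary state and issue internal draws (e.g. a blit path) ...

SwrRestoreState(hContext, pSaved, createInfo.contextSaveSize);
free(pSaved);
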
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) - { - DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; - while (StillDrawing(pContext, pDC)) - { - _mm_pause(); - } + while (!pContext->dcRing.IsEmpty()) + { + _mm_pause(); } + RDTSC_STOP(APIWaitForIdle, 1, 0); } @@ -770,16 +710,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; } } - +// templated backend function tables +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; const SWR_RASTSTATE &rastState = pState->state.rastState; + const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0; // setup backend - if (pState->state.psState.pfnPixelShader == nullptr) + if (psState.pfnPixelShader == nullptr) { backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; // always need to generate I & J per sample for Z interpolation @@ -788,41 +737,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC) else { const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0; - const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 
1 : 0; // currently only support 'normal' input coverage - SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || - pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function - switch(pState->state.psState.shadingRate) + switch(psState.shadingRate) { case SWR_SHADING_RATE_PIXEL: if(bMultisampleEnable) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; break; - case SWR_SHADING_RATE_COARSE: default: SWR_ASSERT(0 && "Invalid shading rate"); break; @@ -913,7 +861,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) uint32_t numRTs = pState->state.psState.numRenderTargets; pState->state.colorHottileEnable = 0; - if(pState->state.psState.pfnPixelShader != nullptr) + if (psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { @@ -1005,6 +953,11 @@ uint32_t MaxVertsPerDraw( } break; + // The Primitive Assembly code can only handle 1 RECT at a time. + case TOP_RECT_LIST: + vertsPerDraw = 3; + break; + default: // We are not splitting up draws for other topologies. 
break; @@ -1116,6 +1069,8 @@ void DrawInstanced( pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; + pDC->cleanupState = (remainingVerts == numVertsForDraw); + //enqueue DC QueueDraw(pContext); @@ -1250,6 +1205,8 @@ void DrawIndexedInstance( pDC->FeWork.desc.draw.baseVertex = baseVertex; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->cleanupState = (remainingIndices == numIndicesForDraw); + //enqueue DC QueueDraw(pContext); @@ -1305,7 +1262,10 @@ void SwrDrawIndexedInstanced( DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); } -// Attach surfaces to pipeline +////////////////////////////////////////////////////////////////////////// +/// @brief SwrInvalidateTiles +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. void SwrInvalidateTiles( HANDLE hContext, uint32_t attachmentMask) @@ -1313,10 +1273,39 @@ void SwrInvalidateTiles( SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT)); + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; + + //enqueue + QueueDraw(pContext); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDiscardRect +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. 
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded +void SwrDiscardRect( + HANDLE hContext, + uint32_t attachmentMask, + SWR_RECT rect) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + // Queue a load to the hottile - pDC->FeWork.type = INVALIDATETILES; - pDC->FeWork.pfnWork = ProcessInvalidateTiles; - pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.desc.discardInvalidateTiles.rect = rect; + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; //enqueue QueueDraw(pContext); @@ -1391,7 +1380,7 @@ void SwrClearRenderTarget( uint32_t clearMask, const float clearColor[4], float z, - BYTE stencil) + uint8_t stencil) { RDTSC_START(APIClearRenderTarget); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 72fae8b2c21..90c2f038c46 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t /// @param pDstHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Function signature for store hot tiles @@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma /// @param pSrcHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile); /// @brief Function signature for clearing from the hot tiles clear value /// @param hPrivateContext - handle to private data @@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, SWR_RENDERTARGET_ATTACHMENT rtIndex, uint32_t x, uint32_t y, const float* pClearColor); +class BucketManager; + ////////////////////////////////////////////////////////////////////////// /// SWR_CREATECONTEXT_INFO ///////////////////////////////////////////////////////////////////////// @@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO // Use SwrGetPrivateContextState() to access private state. uint32_t privateStateSize; - // Each SWR context can have multiple sets of active state - uint32_t maxSubContexts; - - // tile manipulation functions + // Tile manipulation functions PFN_LOAD_TILE pfnLoadTile; PFN_STORE_TILE pfnStoreTile; PFN_CLEAR_TILE pfnClearTile; + + // Pointer to rdtsc buckets mgr returned to the caller. 
+    // Only populated when KNOB_ENABLE_RDTSC is set
+    BucketManager* pBucketMgr;
+
+    // Output: size of the memory block that must be passed to SwrSaveState / SwrRestoreState
+    size_t contextSaveSize;
 };

 //////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SWR_API SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo);
+    SWR_CREATECONTEXT_INFO* pCreateInfo);

 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
@@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext(
     HANDLE hContext);

 //////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-///     SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
     HANDLE hContext,
-    uint32_t subContextIndex);
+    void* pOutputStateBlock,
+    size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize);

 //////////////////////////////////////////////////////////////////////////
 /// @brief Sync cmd. Executes the callback func when all rendering up to this sync
@@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles(
     uint32_t attachmentMask);

 //////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect);
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief SwrDispatch
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
@@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles(
 void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
     uint32_t clearMask,
-    const FLOAT clearColor[4],
+    const float clearColor[4],
     float z,
-    BYTE stencil);
+    uint8_t stencil);

 void SWR_API SwrSetRastState(
     HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d3f4c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file arena.cpp -* -* @brief Arena memory manager -* The arena is convenient and fast for managing allocations for any of -* our allocations that are associated with operations and can all be freed -* once when their operation has completed. Allocations are cheap since -* most of the time its simply an increment of an offset. Also, no need to -* free individual allocations. All of the arena memory can be freed at once. -* -******************************************************************************/ - -#include "context.h" -#include "arena.h" - -#include <cmath> - -Arena::Arena() - : m_pCurBlock(nullptr), m_size(0) -{ - m_pMutex = new std::mutex(); -} - -Arena::~Arena() -{ - Reset(); // Reset just in case to avoid leaking memory. - - if (m_pCurBlock) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - } - - delete m_pMutex; -} - -///@todo Remove this when all users have stopped using this. -void Arena::Init() -{ - m_size = 0; - m_pCurBlock = nullptr; - - m_pMutex = new std::mutex(); -} - -void* Arena::AllocAligned(size_t size, size_t align) -{ - if (m_pCurBlock) - { - ArenaBlock* pCurBlock = m_pCurBlock; - pCurBlock->offset = AlignUp(pCurBlock->offset, align); - - if ((pCurBlock->offset + size) <= pCurBlock->blockSize) - { - void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset); - pCurBlock->offset += size; - m_size += size; - return pMem; - } - - // Not enough memory in this block, fall through to allocate - // a new block - } - - static const size_t ArenaBlockSize = 1024*1024; - size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize)); - blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); - - void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. 
- SWR_ASSERT(pMem != nullptr); - - ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock(); - SWR_ASSERT(pNewBlock != nullptr); - - if (pNewBlock != nullptr) - { - pNewBlock->pNext = m_pCurBlock; - - m_pCurBlock = pNewBlock; - m_pCurBlock->pMem = pMem; - m_pCurBlock->blockSize = blockSize; - - } - - return AllocAligned(size, align); -} - -void* Arena::Alloc(size_t size) -{ - return AllocAligned(size, 1); -} - -void* Arena::AllocAlignedSync(size_t size, size_t align) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = AllocAligned(size, align); - m_pMutex->unlock(); - - return pAlloc; -} - -void* Arena::AllocSync(size_t size) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = Alloc(size); - m_pMutex->unlock(); - - return pAlloc; -} - -void Arena::Reset(bool removeAll) -{ - if (m_pCurBlock) - { - m_pCurBlock->offset = 0; - - ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; - m_pCurBlock->pNext = nullptr; - while(pUsedBlocks) - { - ArenaBlock* pBlock = pUsedBlocks; - pUsedBlocks = pBlock->pNext; - - _aligned_free(pBlock->pMem); - delete pBlock; - } - - if (removeAll) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - m_pCurBlock = nullptr; - } - } - - m_size = 0; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 76eee11fb08..67d81a44347 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -33,37 +33,308 @@ #pragma once #include <mutex> +#include <algorithm> +#include <atomic> +#include "core/utils.h" -class Arena +class DefaultAllocator { public: - Arena(); - ~Arena(); + void* AllocateAligned(size_t size, size_t align) + { + void* p = _aligned_malloc(size, align); + return p; + } + void Free(void* pMem) + { + _aligned_free(pMem); + } +}; - void Init(); +static const size_t ARENA_BLOCK_ALIGN = 64; - void* AllocAligned(size_t size, size_t align); - void* Alloc(size_t size); +struct ArenaBlock +{ + size_t blockSize = 0; + ArenaBlock* pNext = nullptr; +}; +static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, + "Increase BLOCK_ALIGN size"); - void* AllocAlignedSync(size_t size, size_t align); - void* AllocSync(size_t size); +// Caching Allocator for Arena +template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16> +struct CachingAllocatorT : DefaultAllocator +{ + static uint32_t GetBucketId(size_t blockSize) + { + uint32_t bucketId = 0; - void Reset(bool removeAll = false); - size_t Size() { return m_size; } +#if defined(BitScanReverseSizeT) + BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); + bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); +#endif -private: + return bucketId; + } + + void* AllocateAligned(size_t size, size_t align) + { + SWR_ASSERT(size >= sizeof(ArenaBlock)); + SWR_ASSERT(size <= uint32_t(-1)); + + size_t blockSize = size - ARENA_BLOCK_ALIGN; + + { + // search cached blocks + std::lock_guard<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)]; + ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pPotentialBlock = nullptr; + ArenaBlock* pPotentialPrev = nullptr; + + while (pBlock) + { + if (pBlock->blockSize >= blockSize) + { + if (pBlock == AlignUp(pBlock, align)) + { + if (pBlock->blockSize == blockSize) + { + // Won't find a better match + break; + } + + // We could use this as it is larger than we wanted, but + // continue to search for a 
better match + pPotentialBlock = pBlock; + pPotentialPrev = pPrevBlock; + } + } + else + { + // Blocks are sorted by size (biggest first) + // So, if we get here, there are no blocks + // large enough, fall through to allocation. + pBlock = nullptr; + break; + } + + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + if (!pBlock) + { + // Couldn't find an exact match, use next biggest size + pBlock = pPotentialBlock; + pPrevBlock = pPotentialPrev; + } + + if (pBlock) + { + SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock); + pPrevBlock->pNext = pBlock->pNext; + pBlock->pNext = nullptr; + + return pBlock; + } + + m_totalAllocated += size; + +#if 0 + { + static uint32_t count = 0; + char buf[128]; + sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated)); + OutputDebugStringA(buf); + } +#endif + } + + return this->DefaultAllocator::AllocateAligned(size, align); + } + + void Free(void* pMem) + { + if (pMem) + { + ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem); + SWR_ASSERT(pNewBlock->blockSize >= 0); + + std::unique_lock<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)]; + ArenaBlock* pBlock = pPrevBlock->pNext; + + while (pBlock) + { + if (pNewBlock->blockSize >= pBlock->blockSize) + { + // Insert here + break; + } + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + // Insert into list + SWR_ASSERT(pPrevBlock); + pPrevBlock->pNext = pNewBlock; + pNewBlock->pNext = pBlock; + } + } + + ~CachingAllocatorT() + { + // Free all cached blocks + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + ArenaBlock* pBlock = m_cachedBlocks[i].pNext; + while (pBlock) + { + ArenaBlock* pNext = pBlock->pNext; + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; + } + } + } + + // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... + static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; + static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + + ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + std::mutex m_mutex; + + size_t m_totalAllocated = 0; +}; +typedef CachingAllocatorT<> CachingAllocator; + +template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)> +class TArena +{ +public: + TArena(T& in_allocator) : m_allocator(in_allocator) {} + TArena() : m_allocator(m_defAllocator) {} + ~TArena() + { + Reset(true); + } + + void* AllocAligned(size_t size, size_t align) + { + if (0 == size) + { + return nullptr; + } + + SWR_ASSERT(align <= ARENA_BLOCK_ALIGN); + + if (m_pCurBlock) + { + ArenaBlock* pCurBlock = m_pCurBlock; + size_t offset = AlignUp(m_offset, align); + + if ((offset + size) <= pCurBlock->blockSize) + { + void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN); + m_offset = offset + size; + return pMem; + } + + // Not enough memory in this block, fall through to allocate + // a new block + } + + static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN; + size_t blockSize = std::max(size, ArenaBlockSize); + + // Add in one BLOCK_ALIGN unit to store ArenaBlock in. + blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN); + + void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. 
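
For reference, GetBucketId above leans on BitScanReverse; with the default template parameters (NumBucketsT = 4, StartBucketBitT = 16) the mapping reduces to a clamped floor(log2). A portable restatement of the same math, offered as a sketch:

#include <algorithm>
#include <cstdint>

// Same result as GetBucketId() for CachingAllocatorT<4, 16>: blocks under
// 128KB share bucket 0, then one bucket per power-of-two doubling, clamped
// to the last bucket.
uint32_t GetBucketIdPortable(size_t blockSize)
{
    uint32_t bucketId = 0;
    size_t v = blockSize >> 16;   // StartBucketBitT
    while (v >>= 1)               // floor(log2), like BitScanReverse
    {
        ++bucketId;
    }
    return std::min<uint32_t>(bucketId, 4 - 1);   // NumBucketsT - 1
}

This matters at the call sites because DRAW_CONTEXT::pArena becomes a CachingArena (see context.h below), so per-draw blocks get recycled through these buckets instead of round-tripping through _aligned_malloc on every draw.
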
+ SWR_ASSERT(pMem != nullptr); + + ArenaBlock* pNewBlock = new (pMem) ArenaBlock(); + + if (pNewBlock != nullptr) + { + m_offset = 0; + pNewBlock->pNext = m_pCurBlock; + + m_pCurBlock = pNewBlock; + m_pCurBlock->blockSize = blockSize; + } + + return AllocAligned(size, align); + } + + void* Alloc(size_t size) + { + return AllocAligned(size, 1); + } - struct ArenaBlock + void* AllocAlignedSync(size_t size, size_t align) { - void* pMem = nullptr; - size_t blockSize = 0; - size_t offset = 0; - ArenaBlock* pNext = nullptr; - }; + void* pAlloc = nullptr; - ArenaBlock* m_pCurBlock = nullptr; - size_t m_size = 0; + m_mutex.lock(); + pAlloc = AllocAligned(size, align); + m_mutex.unlock(); + + return pAlloc; + } + + void* AllocSync(size_t size) + { + void* pAlloc = nullptr; + + m_mutex.lock(); + pAlloc = Alloc(size); + m_mutex.unlock(); + + return pAlloc; + } + + void Reset(bool removeAll = false) + { + m_offset = 0; + + if (m_pCurBlock) + { + ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; + while (pUsedBlocks) + { + ArenaBlock* pBlock = pUsedBlocks; + pUsedBlocks = pBlock->pNext; + + m_allocator.Free(pBlock); + } + + if (removeAll) + { + m_allocator.Free(m_pCurBlock); + m_pCurBlock = nullptr; + } + } + } + + bool IsEmpty() + { + return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr); + } + +private: + + ArenaBlock* m_pCurBlock = nullptr; + size_t m_offset = 0; /// @note Mutex is only used by sync allocation functions. - std::mutex* m_pMutex; + std::mutex m_mutex; + + DefaultAllocator m_defAllocator; + T& m_allocator; }; + +using StdArena = TArena<DefaultAllocator>; +using CachingArena = TArena<CachingAllocator>; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 4a472bc9e5c..7fb83edf169 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil } template<SWR_FORMAT format> -void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) +void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) { auto lambda = [&](int comp) { @@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo /// @todo clear data should come in as RGBA32_FLOAT DWORD clearData[4]; float clearFloat[4]; - clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; - clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; - clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; - clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; + clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f; + clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f; + clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f; + clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f; clearData[0] = *(DWORD*)&clearFloat[0]; clearData[1] = *(DWORD*)&clearFloat[1]; clearData[2] = *(DWORD*)&clearFloat[2]; @@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile } -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) { - INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; + DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; 
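
    // Aside: both new API entry points funnel into this one backend op; as
    // configured in api.cpp above, they differ only in the descriptor:
    //
    //   SwrInvalidateTiles: newTileState = SWR_TILE_INVALID,
    //                       createNewTiles = false, fullTilesOnly = false
    //   SwrDiscardRect:     newTileState = SWR_TILE_RESOLVED,
    //                       createNewTiles = true,  fullTilesOnly = true
    //
    // The naming suggests invalidate simply drops whatever is resident, while
    // discard marks tiles resolved so their stale contents are treated as
    // already in sync with memory.
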
SWR_CONTEXT *pContext = pDC->pContext; + const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) { if (pDesc->attachmentMask & (1 << i)) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( + pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); if (pHotTile) { - pHotTile->state = HOTTILE_INVALID; + pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; } } } } #if KNOB_SIMD_WIDTH == 8 -const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff +const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; +const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; +const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; #else #error Unsupported vector width #endif @@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala return _simd_movemask_ps(vClipMask); } -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) -{ - - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - __m256i mask[2]; - __m256i sampleCoverage[2]; - if(bIsStandardPattern) - { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - mask[0] = _mm256_set1_epi32(-1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); - } - } - else - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else 
if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } - - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - __m256i packedCoverage1; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel 
in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } -} - -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); -} - template<bool perspMask> INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) { @@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // type safety guaranteed from template instantiation in BEChooser<>::GetFunc static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); + simdvector blendOut; + for(uint32_t rt = 0; rt < NumRT; ++rt) { uint8_t *pColorSample; @@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND } const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; // Blend outputs and update coverage mask for alpha test if(pfnBlendFunc[rt] != nullptr) @@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND psContext.shaded[1], sample, pColorSample, - psContext.shaded[rt], + blendOut, &psContext.oMask, (simdscalari*)&coverageMask); } @@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // store with color mask if(!pRTBlend->writeDisableRed) { - _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x); + _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); } if(!pRTBlend->writeDisableGreen) { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y); + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); } if(!pRTBlend->writeDisableBlue) { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z); + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); } if(!pRTBlend->writeDisableAlpha) { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w); + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); } } } @@ -884,9 +742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { @@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(coverageMask & MASK) { RDTSC_START(BEBarycentric); - 
psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]; - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 }; + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); if (bInputCoverage) { @@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t } else { - psContext.activeMask = _simd_set1_epi32(-1); + psContext.activeMask = _simd_set1_epi32(-1); } // need to declare enough space for all samples @@ -1552,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_START(BESetup); static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + SWR_CONTEXT *pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; // broadcast scalars BarycentricCoeffs coeffs; @@ -1572,7 +1432,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); - BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; RDTSC_STOP(BESetup, 0, 0); @@ 
-1580,12 +1440,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { // UL pixel corners - simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // iterate over active samples unsigned long sample = 0; @@ -1593,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, while (_BitScanForward(&sample, sampleMask)) { sampleMask &= ~(1 << sample); - if (work.coverageMask[sample] & MASK) + simdmask coverageMask = work.coverageMask[sample] & MASK; + if (coverageMask) { RDTSC_START(BEBarycentric); // calculate per sample positions @@ -1607,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_STOP(BEBarycentric, 0, 0); - simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK); + // interpolate user clip distance if available + if (rastState.clipDistanceMask) + { + coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, + psContext.vI.sample, psContext.vJ.sample); + } + + simdscalar vCoverageMask = vMask(coverageMask); simdscalar stencilPassMask = vCoverageMask; // offset depth/stencil buffers current sample diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 53089e5047b..2fa18953cad 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -29,16 +29,20 @@ #pragma once #include "common/os.h" -#include "core/context.h" +#include "core/context.h" +#include "core/multisample.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); void InitClearTilesTable(); +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); +void InitBackendFuncTables(); +void InitCPSFuncTables(); enum SWR_BACKEND_FUNCS { @@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS SWR_BACKEND_MSAA_SAMPLE_RATE, SWR_BACKEND_FUNCS_MAX, }; -void InitBackendFuncTables(); -extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; -extern PFN_BACKEND_FUNC 
gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; -extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; -extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; -extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; +#if KNOB_SIMD_WIDTH == 8 +extern const __m256 vCenterOffsetsX; +extern const __m256 vCenterOffsetsY; +extern const __m256 vULOffsetsX; +extern const __m256 vULOffsetsY; +#define MASK 0xff +#endif + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } + } + else + { + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } + } + + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + __m256i 
packedCoverage1; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + } + +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index ce27bf71d3c..3a2a8b35be8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ 
b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -31,6 +31,9 @@ #include "common/os.h" #include "core/clip.h" +// Temp storage used by the clipper +THREAD simdvertex tlsTempVertices[7]; + float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) { return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 49494a4e374..ba5870a92bb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -32,6 +32,9 @@ #include "core/pa.h" #include "rdtsc_core.h" +// Temp storage used by the clipper +extern THREAD simdvertex tlsTempVertices[7]; + enum SWR_CLIPCODES { // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. @@ -354,6 +357,25 @@ public: } } + // assemble user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; + } + } + uint32_t numAttribs = maxSlot + 1; simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); @@ -436,6 +458,27 @@ public: } } + // transpose user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); while (clipPa.GetNextStreamOutput()) @@ -630,6 +673,31 @@ private: ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } + + // interpolate clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = 
_simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } } template<SWR_CLIPCODES ClippingPlane> @@ -700,6 +768,27 @@ private: } } + // store clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + // increment outIndex vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); } @@ -818,8 +907,7 @@ private: simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) { // temp storage - simdvertex tempVertices[7]; - float* pTempVerts = (float*)&tempVertices[0]; + float* pTempVerts = (float*)&tlsTempVertices[0]; // zero out num input verts for non-active lanes simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); @@ -854,9 +942,9 @@ private: return vNumOutPts; } - const uint32_t workerId; - const DRIVER_TYPE driverType; - DRAW_CONTEXT* pDC; + const uint32_t workerId{ 0 }; + const DRIVER_TYPE driverType{ DX }; + DRAW_CONTEXT* pDC{ nullptr }; const API_STATE& state; simdscalar clipCodes[NumVertsPerPrim]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 4a214aff1c8..39f23372a18 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -41,6 +41,7 @@ #include "core/knobs.h" #include "common/simdintrin.h" #include "core/threads.h" +#include "ringbuffer.h" // x.8 fixed point precision values #define FIXED_POINT_SHIFT 8 @@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + uint64_t anyCoveredSamples; TRI_FLAGS triFlags; }; @@ -109,12 +111,16 @@ struct CLEAR_DESC CLEAR_FLAGS flags; float clearRTColor[4]; // RGBA_32F float clearDepth; // [0..1] - BYTE clearStencil; + uint8_t clearStencil; }; -struct INVALIDATE_TILES_DESC +struct DISCARD_INVALIDATE_TILES_DESC { uint32_t attachmentMask; + SWR_RECT rect; + SWR_TILE_STATE newTileState; + bool createNewTiles; + bool fullTilesOnly; }; struct SYNC_DESC @@ -150,7 +156,7 @@ enum WORK_TYPE SYNC, DRAW, CLEAR, - INVALIDATETILES, + DISCARDINVALIDATETILES, STORETILES, QUERYSTATS, }; @@ -164,7 +170,7 @@ struct BE_WORK SYNC_DESC sync; TRIANGLE_WORK_DESC tri; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; @@ -201,7 +207,7 @@ struct FE_WORK SYNC_DESC sync; DRAW_WORK draw; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; @@ -354,6 +360,7 @@ struct BACKEND_FUNCS PFN_OUTPUT_MERGER pfnOutputMerger; }; + // Draw State struct DRAW_STATE { @@ -365,7 +372,7 @@ struct DRAW_STATE BACKEND_FUNCS backendFuncs; PFN_PROCESS_PRIMS pfnProcessPrims; - Arena* pArena; // This 
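Every clip-distance hunk above repeats the same two-step scalar math that ComputeInterpFactor and the fused multiply-add express in SIMD. A sketch with illustrative helper names:

// t = d0 / (d0 - d1): parametric point where the edge crosses the boundary,
// given boundary coordinates (e.g. clip distances) d0 and d1 at the endpoints.
static inline float InterpFactorRef(float d0, float d1)
{
    return d0 / (d0 - d1);
}

// a0 + t * (a1 - a0): the scalar form of
// _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0).
static inline float InterpAttribRef(float a0, float a1, float t)
{
    return a0 + t * (a1 - a0);
}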
should only be used by API thread. + CachingArena* pArena; // This should only be used by API thread. }; // Draw Context @@ -381,25 +388,22 @@ struct DRAW_CONTEXT FE_WORK FeWork; volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) inUse; volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - - // Have all worker threads moved past draw in DC ring? - volatile OSALIGNLINE(uint32_t) threadsDoneFE; - volatile OSALIGNLINE(uint32_t) threadsDoneBE; + volatile OSALIGNLINE(int64_t) threadsDone; uint64_t dependency; MacroTileMgr* pTileMgr; // The following fields are valid if isCompute is true. - volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) DRAW_STATE* pState; - Arena* pArena; + CachingArena* pArena; uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. + + bool cleanupState; // True if this is the last draw using an entry in the state ring. }; INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) @@ -438,7 +442,7 @@ struct SWR_CONTEXT // 3. State - When an applications sets state after draw // a. Same as step 1. // b. State is copied from prev draw context to current. - DRAW_CONTEXT* dcRing; + RingBuffer<DRAW_CONTEXT> dcRing; DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. @@ -448,14 +452,10 @@ struct SWR_CONTEXT // These split draws all have identical state. So instead of storing the state directly // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs // to reference a single entry in the DS ring. - DRAW_STATE* dsRing; + RingBuffer<DRAW_STATE> dsRing; uint32_t curStateId; // Current index to the next available entry in the DS ring. - DRAW_STATE* subCtxSave; // Save area for inactive contexts. - uint32_t curSubCtxId; // Current index for active state subcontext. - uint32_t numSubContexts; // Number of available subcontexts - uint32_t NumWorkerThreads; THREAD_POOL threadPool; // Thread pool associated with this context @@ -463,13 +463,6 @@ struct SWR_CONTEXT std::condition_variable FifosNotEmpty; std::mutex WaitLock; - // Draw Contexts will get a unique drawId generated from this - uint64_t nextDrawId; - - // most recent draw id enqueued by the API thread - // written by api thread, read by multiple workers - OSALIGNLINE(volatile uint64_t) DrawEnqueued; - DRIVER_TYPE driverType; uint32_t privateStateSize; @@ -486,6 +479,8 @@ struct SWR_CONTEXT // Scratch space for workers. 
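A hedged sketch of filling the new descriptor; the field meanings follow the frontend handler later in this patch, while the rect values and the tile-state constant are illustrative (the SWR_TILE_STATE enumerators are not shown in this diff):

DISCARD_INVALIDATE_TILES_DESC desc = {};
desc.attachmentMask = (1 << SWR_ATTACHMENT_COLOR0); // which hot-tile attachments to touch
desc.rect.left   = 0;     // an all-zero rect makes the handler fall back to the viewport
desc.rect.top    = 0;
desc.rect.right  = 512;
desc.rect.bottom = 512;
desc.newTileState   = (SWR_TILE_STATE)0; // state applied to affected tiles (value illustrative)
desc.createNewTiles = false;             // don't allocate tiles that don't exist yet
desc.fullTilesOnly  = true;              // skip macrotiles only partially inside the rect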
uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; + + CachingAllocator cachingArenaAllocator; }; void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 4f245c8c53e..2cc9d4054ac 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds INLINE simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase, + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); @@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC INLINE void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, - BYTE *pStencilBase, const simdscalar& stencilMask) + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, + uint8_t *pStencilBase, const simdscalar& stencilMask) { if (pDSState->depthWriteEnable) { diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 7e556012e6b..ccf0b70544f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -49,7 +49,8 @@ struct QUEUE static const uint32_t mBlockSizeShift = 6; static const uint32_t mBlockSize = 1 << mBlockSizeShift; - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mHead = 0; mTail = 0; @@ -102,7 +103,8 @@ struct QUEUE mNumEntries --; } - bool enqueue_try_nosync(Arena& arena, const T* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const T* entry) { memcpy(&mCurBlock[mTail], entry, sizeof(T)); diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 83d85fc86d8..344758eefe5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -34,7 +34,7 @@ /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT SrcFormat> -INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst) +INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) { // fast path for float32 if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) @@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component) /// @param src - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT DstFormat> -INLINE void StoreSOA(const simdvector &src, BYTE *pDst) +INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) { // fast path for float32 if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h 
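LoadSOA/StoreSOA specialize per format at compile time, so a conversion is just a pair of calls. A usage sketch; the two SWR_FORMAT values are assumptions and the SOA buffer pointers are placeholders:

simdvector pixels;                          // 4 channels x SIMD-width floats, SOA layout
LoadSOA<R32G32B32A32_FLOAT>(pSrc, pixels);  // fast path: 32-bit float channels load directly
// ... operate on the float data ...
StoreSOA<R8G8B8A8_UNORM>(pixels, pDst);     // normalizes and packs each channel to 8 bits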
b/src/gallium/drivers/swr/rasterizer/core/format_types.h index aa350259a15..9acf846a7f0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false> struct PackTraits { static const uint32_t MyNumBits = NumBits; - static simdscalar loadSOA(const BYTE *pSrc) = delete; - static void storeSOA(BYTE *pDst, simdscalar src) = delete; + static simdscalar loadSOA(const uint8_t *pSrc) = delete; + static void storeSOA(uint8_t *pDst, simdscalar src) = delete; static simdscalar unpack(simdscalar &in) = delete; static simdscalar pack(simdscalar &in) = delete; }; @@ -48,8 +48,8 @@ struct PackTraits<0, false> { static const uint32_t MyNumBits = 0; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); } - static void storeSOA(BYTE *pDst, simdscalar src) { return; } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); } + static void storeSOA(uint8_t *pDst, simdscalar src) { return; } static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } }; @@ -63,7 +63,7 @@ struct PackTraits<8, false> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -74,7 +74,7 @@ struct PackTraits<8, false> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -125,7 +125,7 @@ struct PackTraits<8, true> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -136,7 +136,7 @@ struct PackTraits<8, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -188,7 +188,7 @@ struct PackTraits<16, false> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -199,7 +199,7 @@ struct PackTraits<16, false> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -249,7 +249,7 @@ struct PackTraits<16, true> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -260,7 +260,7 @@ struct PackTraits<16, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -311,8 +311,8 @@ struct PackTraits<32, false> { static const uint32_t MyNumBits = 32; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } - static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); } + static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } static simdscalar unpack(simdscalar &in) { return in; } static 
simdscalar pack(simdscalar &in) { return in; } }; @@ -984,7 +984,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::fromFloat(); } - INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc) + INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc) { switch (comp) { @@ -1001,7 +1001,7 @@ return TypeTraits<X, NumBitsX>::loadSOA(pSrc); } - INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src) + INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src) { switch (comp) { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f43a672bd82..36721e00beb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -193,35 +193,71 @@ void ProcessStoreTiles( /// @param workerId - thread's worker id. Every thread has a unique id. /// @param pUserData - Pointer to user data passed back to callback. /// @todo This should go away when we switch this to use compute threading. -void ProcessInvalidateTiles( +void ProcessDiscardInvalidateTiles( SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData) { RDTSC_START(FEProcessInvalidateTiles); - INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; + DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; MacroTileMgr *pTileMgr = pDC->pTileMgr; - const API_STATE& state = GetApiState(pDC); + SWR_RECT rect; + + if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left) + { + // Valid rect + rect = pInv->rect; + } + else + { + // Use viewport dimensions + const API_STATE& state = GetApiState(pDC); + + rect.left = (uint32_t)state.vp[0].x; + rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width); + rect.top = (uint32_t)state.vp[0].y; + rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height); + } // queue a store to each macro tile // compute macro tile bounds for the current render target uint32_t macroWidth = KNOB_MACROTILE_X_DIM; uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; - uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; - uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; + // Setup region assuming full tiles + uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth; + uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight; + + uint32_t macroTileEndX = rect.right / macroWidth; + uint32_t macroTileEndY = rect.bottom / macroHeight; + + if (pInv->fullTilesOnly == false) + { + // include partial tiles + macroTileStartX = rect.left / macroWidth; + macroTileStartY = rect.top / macroHeight; + + macroTileEndX = (rect.right + macroWidth - 1) / macroWidth; + macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight; + } + + SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y); + + macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X); + macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y); // load tiles BE_WORK work; - work.type = INVALIDATETILES; - work.pfnWork = ProcessInvalidateTilesBE; - work.desc.invalidateTiles = *pInv; + work.type = DISCARDINVALIDATETILES; + work.pfnWork = ProcessDiscardInvalidateTilesBE; + work.desc.discardInvalidateTiles = *pInv; - for (uint32_t x = 0; x < numMacroTilesX; ++x) + for
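The start/end computations above reduce to the usual round-up/round-down index math, with the end index exclusive. A scalar sketch using the same names:

// fullTilesOnly == true: only macrotiles completely inside the rect
uint32_t fullStartX = (rect.left + macroWidth - 1) / macroWidth;  // round up: skip a partial left column
uint32_t fullEndX   = rect.right / macroWidth;                    // round down: skip a partial right column
// fullTilesOnly == false: every macrotile the rect touches
uint32_t anyStartX  = rect.left / macroWidth;                     // round down: include the partial left column
uint32_t anyEndX    = (rect.right + macroWidth - 1) / macroWidth; // round up: include the partial right column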
(uint32_t x = macroTileStartX; x < macroTileEndX; ++x) { - for (uint32_t y = 0; y < numMacroTilesY; ++y) + for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y) { pTileMgr->enqueue(x, y, &work); } @@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num } } +THREAD SWR_GS_CONTEXT tlsGsContext; + ////////////////////////////////////////////////////////////////////////// /// @brief Implements GS stage. /// @param pDC - pointer to draw context. @@ -651,7 +689,6 @@ static void GeometryShaderStage( { RDTSC_START(FEGeometryShader); - SWR_GS_CONTEXT gsContext; SWR_CONTEXT* pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); @@ -660,9 +697,9 @@ static void GeometryShaderStage( SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); - gsContext.pStream = (uint8_t*)pGsOut; - gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; - gsContext.PrimitiveID = primID; + tlsGsContext.pStream = (uint8_t*)pGsOut; + tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; + tlsGsContext.PrimitiveID = primID; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); simdvector attrib[MAX_ATTRIBUTES]; @@ -675,7 +712,7 @@ static void GeometryShaderStage( for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[attribSlot] = attrib[i]; + tlsGsContext.vert[i].attrib[attribSlot] = attrib[i]; } } @@ -683,7 +720,7 @@ static void GeometryShaderStage( pa.Assemble(VERTEX_POSITION_SLOT, attrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; + tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; } const uint32_t vertexStride = sizeof(simdvertex); @@ -710,14 +747,14 @@ static void GeometryShaderStage( for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) { - gsContext.InstanceID = instance; - gsContext.mask = GenerateMask(numInputPrims); + tlsGsContext.InstanceID = instance; + tlsGsContext.mask = GenerateMask(numInputPrims); // execute the geometry shader - state.pfnGsFunc(GetPrivateState(pDC), &gsContext); + state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext); - gsContext.pStream += instanceStride; - gsContext.pCutOrStreamIdBuffer += cutInstanceStride; + tlsGsContext.pStream += instanceStride; + tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride; } // set up new binner and state for the GS output topology @@ -736,7 +773,7 @@ static void GeometryShaderStage( // foreach input prim: // - setup a new PA based on the emitted verts for that prim // - loop over the new verts, calling PA to assemble each prim - uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; + uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount; uint32_t* pPrimitiveId = (uint32_t*)&primID; uint32_t totalPrimsGenerated = 0; @@ -844,7 +881,7 @@ static void GeometryShaderStage( static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, void **ppStreamCutBuffer) { - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); SWR_ASSERT(state.gsState.gsEnable); // allocate arena space to hold GS output verts @@ -1186,7 +1223,7 @@ void ProcessDraw( // if the entire index buffer isn't being consumed, set the last index // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); + 
fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); if (pLastRequestedIndex < fetchInfo.pLastIndex) { fetchInfo.pLastIndex = pLastRequestedIndex; @@ -1362,7 +1399,7 @@ void ProcessDraw( i += KNOB_SIMD_WIDTH; if (IsIndexedT) { - fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); } else { @@ -1776,7 +1813,7 @@ void BinTriangles( work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; } - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -1948,7 +1985,7 @@ void BinPoints( work.pfnWork = RasterizeSimplePoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store attributes @@ -2082,7 +2119,7 @@ void BinPoints( work.pfnWork = RasterizeTriPoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -2299,7 +2336,7 @@ void BinLines( work.pfnWork = RasterizeLine; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index acb935fc251..f92f88c3226 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB) //vMul = [A1*B2 - B1*A2] vMul = _mm_sub_epi64(vMul, vMul2); - // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned - OSALIGN(int64_t, 16) result; - _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul)); + int64_t result; + _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); - double fResult = (double)result; - fResult = fResult * (1.0 / FIXED_POINT16_SCALE); + double dResult = (double)result; + dResult = dResult * (1.0 / FIXED_POINT16_SCALE); - return (float)fResult; + return (float)dResult; } INLINE @@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h index 3f19555557f..adf738c1bed 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) } } +static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) +{ + knobValue = pOverride; +} + template <typename T> static inline void InitKnob(T& knob) { diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 
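The _mm_store1_pd to _mm_store_sd change drops the 16-byte-alignment requirement: store_sd writes a single 64-bit lane and accepts an unaligned address, so the OSALIGN scratch variable is no longer needed. The value itself is plain fixed-point math; a scalar sketch:

// The vertex deltas are x.8 fixed point, so their cross product carries
// 16 fractional bits; FIXED_POINT16_SCALE divides them back out.
static inline float CalcDeterminantRef(int32_t ax, int32_t ay, int32_t bx, int32_t by)
{
    int64_t det = (int64_t)ax * by - (int64_t)bx * ay;  // 64-bit to avoid overflow
    return (float)((double)det * (1.0 / FIXED_POINT16_SCALE));
}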
2028d9fbcfe..f8f1a33b7e3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -34,12 +34,12 @@ struct PA_STATE { - DRAW_CONTEXT *pDC; // draw context - uint8_t* pStreamBase; // vertex stream - uint32_t streamSizeInVerts; // total size of the input stream in verts + DRAW_CONTEXT *pDC{ nullptr }; // draw context + uint8_t* pStreamBase{ nullptr }; // vertex stream + uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts // The topology the binner will use. In some cases the FE changes the topology from the api state. - PRIMITIVE_TOPOLOGY binTopology; + PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; PA_STATE() {} PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : @@ -76,37 +76,37 @@ struct PA_STATE // cuts struct PA_STATE_OPT : public PA_STATE { - simdvertex leadingVertex; // For tri-fan - uint32_t numPrims; // Total number of primitives for draw. - uint32_t numPrimsComplete; // Total number of complete primitives. + simdvertex leadingVertex; // For tri-fan + uint32_t numPrims{ 0 }; // Total number of primitives for draw. + uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. - uint32_t numSimdPrims; // Number of prims in current simd. + uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. - uint32_t cur; // index to current VS output. - uint32_t prev; // index to prev VS output. Not really needed in the state. - uint32_t first; // index to first VS output. Used for trifan. + uint32_t cur{ 0 }; // index to current VS output. + uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. + uint32_t first{ 0 }; // index to first VS output. Used for trifan. - uint32_t counter; // state counter - bool reset; // reset state + uint32_t counter{ 0 }; // state counter + bool reset{ false }; // reset state - uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) + uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) simdscalari primID; typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); - PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. - PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. - PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset + PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. + PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. + PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset // state used to advance the PA when Next is called - PFN_PA_FUNC pfnPaNextFunc; - uint32_t nextNumSimdPrims; - uint32_t nextNumPrimsIncrement; - bool nextReset; - bool isStreaming; + PFN_PA_FUNC pfnPaNextFunc{ nullptr }; + uint32_t nextNumSimdPrims{ 0 }; + uint32_t nextNumPrimsIncrement{ 0 }; + bool nextReset{ false }; + bool isStreaming{ false }; - simdmask tmpIndices; // temporary index store for unused virtual function + simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, @@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane) // Cut-aware primitive assembler. 
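The brace initializers added throughout pa.h are C++11 non-static data member initializers: every member gets a deterministic value even through the empty PA_STATE_OPT() {} constructor, so a default-constructed PA no longer carries garbage. A minimal illustration of the pattern:

struct Example
{
    uint32_t count{ 0 };      // initialized even when Example() {} is used
    bool     ready{ false };
    void*    pData{ nullptr };
    Example() {}              // no init list needed; members above are already set
};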
struct PA_STATE_CUT : public PA_STATE { - simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex - uint32_t numVerts; // number of vertices available in buffer store - uint32_t numAttribs; // number of attributes - int32_t numRemainingVerts; // number of verts remaining to be assembled - uint32_t numVertsToAssemble; // total number of verts to assemble for the draw + simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex + uint32_t numVerts{ 0 }; // number of vertices available in buffer store + uint32_t numAttribs{ 0 }; // number of attributes + int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled + uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd - uint32_t numPrimsAssembled; // number of primitives that are fully assembled - uint32_t headVertex; // current unused vertex slot in vertex buffer store - uint32_t tailVertex; // beginning vertex currently assembling - uint32_t curVertex; // current unprocessed vertex - uint32_t startPrimId; // starting prim id - simdscalari vPrimId; // vector of prim ID - bool needOffsets; // need to compute gather offsets for current SIMD - uint32_t vertsPerPrim; - simdvertex tmpVertex; // temporary simdvertex for unimplemented API - bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they - // are ignored. Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index + uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled + uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store + uint32_t tailVertex{ 0 }; // beginning vertex currently assembling + uint32_t curVertex{ 0 }; // current unprocessed vertex + uint32_t startPrimId{ 0 }; // starting prim id + simdscalari vPrimId; // vector of prim ID + bool needOffsets{ false }; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim{ 0 }; + simdvertex tmpVertex; // temporary simdvertex for unimplemented API + bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored + // while the GS sends valid verts for every index // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; - uint32_t curIndex; - bool reverseWinding; // indicates reverse winding for strips - int32_t adjExtraVert; // extra vert used for tristrip w/ adj + uint32_t curIndex{ 0 }; + bool reverseWinding{ false }; // indicates reverse winding for strips + int32_t adjExtraVert{ 0 }; // extra vert used for tristrip w/ adj typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); - PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert + PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert PA_STATE_CUT() {} PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, @@ -1199,9 +1199,9 @@ struct PA_FACTORY PA_STATE_OPT paOpt; PA_STATE_CUT paCut; - bool cutPA; + bool cutPA{ false }; - PRIMITIVE_TOPOLOGY topo; + PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 587e336d87d..52fb7c88cdd 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used for testing if entire raster tile is inside a triangle - vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets); - vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets); - vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets); + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); + } // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox // step sample positions to the raster tile bbox of multisample points @@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // | | // | | // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) - __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; + __m256d vEdgeTileBbox[3]; if (sampleCount > SWR_MULTISAMPLE_1X) { __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX(); @@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // step edge equation tests from Tile // used for testing if entire raster tile is inside a triangle - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8); - vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8); - vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8); - vEdge2TileBbox = _mm256_add_pd(vResultAxFix16,
vResultByFix16); + for (uint32_t e = 0; e < 3; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); + vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + } } RDTSC_STOP(BEStepSetup, 0, pDC->drawId); @@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, for (uint32_t tileX = tX; tileX <= maxX; ++tileX) { - uint64_t anyCoveredSamples = 0; + triDesc.anyCoveredSamples = 0; // is the corner of the edge outside of the raster tile? (vEdge < 0) int mask0, mask1, mask2; @@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); + vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); mask0 = _mm256_movemask_pd(vSampleBboxTest0); mask1 = _mm256_movemask_pd(vSampleBboxTest1); mask2 = _mm256_movemask_pd(vSampleBboxTest2); @@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; if ((mask0 & mask1 & mask2) == 0xf) { - anyCoveredSamples = triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; // trivial accept, all 4 corners of all 3 edges are negative // i.e. 
raster tile completely inside triangle RDTSC_EVENT(BETrivialAccept, 1, 0); } else { - __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; + __m256d vEdgeAtSample[numEdges]; if(sampleCount == SWR_MULTISAMPLE_1X) { // should get optimized out for single sample case (global value numbering or copy propagation) - vEdge0AtSample = vEdgeFix16[0]; - vEdge1AtSample = vEdgeFix16[1]; - vEdge2AtSample = vEdgeFix16[2]; + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeAtSample[e] = vEdgeFix16[e]; + } } else { @@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // for each edge and broadcasts it before offsetting to individual pixel quads // step edge equation tests from UL tile corner to pixel sample position - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY); - vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY); - vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY); - vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); + for (uint32_t e = 0; e < numEdges; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); + vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); + } } double startQuadEdges[numEdges]; const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample); - _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample); - _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample); - - for (uint32_t e = 3; e < numEdges; ++e) + for (uint32_t e = 0; e < numEdges; ++e) { - _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]); + _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); } // not trivial accept or reject, must rasterize full tile @@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } RDTSC_STOP(BERasterizePartial, 0, 0); - anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; } } else @@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else #endif - if(anyCoveredSamples) + if(triDesc.anyCoveredSamples) { RDTSC_START(BEPixelBackend); backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h new file mode 100644 index 00000000000..7ff109d4fe8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -0,0 +1,102 @@ 
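The rasterizer hunks above all evaluate the same affine edge function at tile corners and sample offsets. A scalar sketch of the trivial accept test; note that in this code's convention a negative edge value means "inside", so a tile is trivially accepted when all four corners are negative for all three edges (the movemask checks collect exactly those sign bits):

// corners[e][k]: edge e evaluated at corner k of the raster tile.
static inline bool TrivialAcceptRef(const double corners[3][4])
{
    for (uint32_t e = 0; e < 3; ++e)
    {
        for (uint32_t k = 0; k < 4; ++k)
        {
            if (corners[e][k] >= 0.0)
            {
                return false; // a corner is outside this edge: rasterize or reject
            }
        }
    }
    return true; // raster tile completely inside the triangle: full coverage
}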
+/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ringbuffer.h +* +* @brief RingBuffer +* The RingBuffer class manages all aspects of the ring buffer including +* the head/tail indices, etc. +* +******************************************************************************/ +#pragma once + +template<typename T> +class RingBuffer +{ +public: + RingBuffer() + : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) + { + } + + ~RingBuffer() + { + Destroy(); + } + + void Init(uint32_t numEntries) + { + SWR_ASSERT(numEntries > 0); + mNumEntries = numEntries; + mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64); + SWR_ASSERT(mpRingBuffer != nullptr); + memset(mpRingBuffer, 0, sizeof(T)*numEntries); + } + + void Destroy() + { + _aligned_free(mpRingBuffer); + mpRingBuffer = nullptr; + } + + T& operator[](const uint32_t index) + { + SWR_ASSERT(index < mNumEntries); + return mpRingBuffer[index]; + } + + INLINE void Enqueue() + { + mRingHead++; // There's only one producer. + } + + INLINE void Dequeue() + { + InterlockedIncrement(&mRingTail); // There are multiple consumers. + } + + INLINE bool IsEmpty() + { + return (GetHead() == GetTail()); + } + + INLINE bool IsFull() + { + ///@note We don't handle wrap case due to using 64-bit indices. + /// It would take 11 million years to wrap at 50,000 DCs per sec. + /// If we used 32-bit indices then it's about 23 hours to wrap. + uint64_t numEnqueued = GetHead() - GetTail(); + SWR_ASSERT(numEnqueued <= mNumEntries); + + return (numEnqueued == mNumEntries); + } + + INLINE volatile uint64_t GetTail() { return mRingTail; } + INLINE volatile uint64_t GetHead() { return mRingHead; } + +protected: + T* mpRingBuffer; + uint32_t mNumEntries; + + OSALIGNLINE(volatile uint64_t) mRingHead; // Producer Counter + OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer Counter +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 2758555fd4b..5752094ca10 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -307,6 +307,8 @@ struct PixelPositions simdscalar centroid; }; +#define SWR_MAX_NUM_MULTISAMPLES 16 + ////////////////////////////////////////////////////////////////////////// /// SWR_PS_CONTEXT /// @brief Input to pixel shader.
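A hedged usage sketch of the new RingBuffer in its single-producer/multi-consumer role; the knob name and the modulo slot math mirror how the DC ring is indexed elsewhere in this patch, but treat them as assumptions:

RingBuffer<DRAW_CONTEXT> ring;
ring.Init(KNOB_MAX_DRAWS_IN_FLIGHT);

// Producer (API thread): claim the head slot, fill it, then publish.
if (!ring.IsFull())
{
    DRAW_CONTEXT& dc = ring[ring.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT];
    // ... set up the draw in dc ...
    ring.Enqueue();  // plain increment of the head; there is only one producer
}

// Consumer (the last worker to retire a draw):
ring.Dequeue();      // InterlockedIncrement on the tail; there are many consumers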
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT uint32_t frontFace; // IN: front- 1, back- 0 uint32_t primID; // IN: primitive ID uint32_t sampleIndex; // IN: sampleIndex + }; ////////////////////////////////////////////////////////////////////////// @@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE }; static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -#define SWR_MAX_NUM_MULTISAMPLES 16 enum SWR_MULTISAMPLE_COUNT { SWR_MULTISAMPLE_1X = 0, @@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); -typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); +typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE @@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; }; + union SWR_DEPTH_STENCIL_STATE { struct @@ -980,7 +984,6 @@ enum SWR_SHADING_RATE { SWR_SHADING_RATE_PIXEL, SWR_SHADING_RATE_SAMPLE, - SWR_SHADING_RATE_COARSE, SWR_SHADING_RATE_MAX, }; @@ -1024,4 +1027,5 @@ struct SWR_PS_STATE uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test + }; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 24c5588bfec..07bc94a1a54 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -24,7 +24,6 @@ #include <stdio.h> #include <thread> #include <algorithm> -#include <unordered_set> #include <float.h> #include <vector> #include <utility> @@ -44,7 +43,6 @@ #include "rasterizer.h" #include "rdtsc_core.h" #include "tilemgr.h" -#include "core/multisample.h" @@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= INLINE uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) { - //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); - //return result; - return pContext->DrawEnqueued; + return pContext->dcRing.GetHead(); } INLINE @@ -283,170 +279,27 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti return (pDC->dependency > lastRetiredDraw); } -void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { - // Load clear color into SIMD register... 
- float *pClearData = (float*)(pHotTile->clearData); - simdscalar valR = _simd_broadcast_ss(&pClearData[0]); - simdscalar valG = _simd_broadcast_ss(&pClearData[1]); - simdscalar valB = _simd_broadcast_ss(&pClearData[2]); - simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + int64_t result = InterlockedDecrement64(&pDC->threadsDone); + SWR_ASSERT(result >= 0); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + if (result == 0) { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + // Cleanup memory allocations + pDC->pArena->Reset(true); + pDC->pTileMgr->initialize(); + if (pDC->cleanupState) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) - { - _simd_store_ps(pfBuf, valR); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valG); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valB); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valA); - pfBuf += KNOB_SIMD_WIDTH; - } + pDC->pState->pArena->Reset(true); } - } -} - -void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... - float *pClearData = (float*)(pHotTile->clearData); - simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) - { - _simd_store_ps(pfBuf, valZ); - pfBuf += KNOB_SIMD_WIDTH; - } - } - } -} - -void ClearStencilHotTile(const HOTTILE* pHotTile) -{ - // convert from F32 to U8. - uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - //broadcast 32x into __m256i... - simdscalari valS = _simd_set1_epi8(clearVal); - - simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) - { - _simd_store_si(pBuf, valS); - pBuf += 1; - } - } - } -} - -// for draw calls, we initialize the active hot tiles and perform deferred -// load on them if tile is in invalid state. 
we do this in the outer thread loop instead of inside -// the draw routine itself mainly for performance, to avoid unnecessary setup -// every triangle -// @todo support deferred clear -INLINE -void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) -{ - const API_STATE& state = GetApiState(pDC); - HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; - - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - x *= KNOB_MACROTILE_X_DIM; - y *= KNOB_MACROTILE_Y_DIM; - - uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); - - // check RT if enabled - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while(_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); - - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. - ClearColorHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - colorHottileEnableMask &= ~(1 << rtSlot); - } + _ReadWriteBarrier(); - // check depth if enabled - if (state.depthHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. - ClearDepthHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } + pContext->dcRing.Dequeue(); // Remove from tail } - // check stencil if enabled - if (state.stencilHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. 
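The load-or-clear branches above (and their new home in HotTileMgr later in this patch) implement a small per-tile state machine. A sketch using the states and calls visible in this diff, with attachment/x/y as in the surrounding code:

// HOTTILE_INVALID : contents stale    -> load from the surface, then mark DIRTY
// HOTTILE_CLEAR   : deferred clear    -> splat the stored clear value, then mark DIRTY
// HOTTILE_DIRTY   : rendered to       -> must be stored back before reuse
// HOTTILE_RESOLVED: in sync with the surface
switch (pHotTile->state)
{
case HOTTILE_INVALID:
    pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT,
        attachment, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
    pHotTile->state = HOTTILE_DIRTY;
    break;
case HOTTILE_CLEAR:
    ClearColorHotTile(pHotTile);   // broadcast clearData across the whole tile
    pHotTile->state = HOTTILE_DIRTY;
    break;
default:
    break;                         // DIRTY or RESOLVED: nothing to do before drawing
}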
- ClearStencilHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - } + return result; } INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) @@ -466,7 +319,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) if (isWorkComplete) { curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); + CompleteDrawContext(pContext, pDC); } else { @@ -496,7 +349,9 @@ void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, - std::unordered_set<uint32_t>& lockedTiles) + TileSet& lockedTiles, + uint32_t numaNode, + uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. @@ -537,68 +392,78 @@ for (uint32_t tileID : macroTiles) { + // Only work on tiles for this numa node + uint32_t x, y; + pDC->pTileMgr->getTileIndices(tileID, x, y); + if (((x ^ y) & numaMask) != numaNode) + { + continue; + } + MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); + if (!tile.getNumQueued()) + { + continue; + } + // can only work on this draw if it's not in use by other threads - if (lockedTiles.find(tileID) == lockedTiles.end()) + if (lockedTiles.find(tileID) != lockedTiles.end()) { - if (tile.getNumQueued()) + continue; + } + + if (tile.tryLock()) + { + BE_WORK *pWork; + + RDTSC_START(WorkerFoundWork); + + uint32_t numWorkItems = tile.getNumQueued(); + SWR_ASSERT(numWorkItems); + + pWork = tile.peek(); + SWR_ASSERT(pWork); + if (pWork->type == DRAW) { - if (tile.tryLock()) - { - BE_WORK *pWork; - - RDTSC_START(WorkerFoundWork); - - uint32_t numWorkItems = tile.getNumQueued(); - - if (numWorkItems != 0) - { - pWork = tile.peek(); - SWR_ASSERT(pWork); - if (pWork->type == DRAW) - { - InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); - } - } - - while ((pWork = tile.peek()) != nullptr) - { - pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); - tile.dequeue(); - } - RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); - - _ReadWriteBarrier(); - - pDC->pTileMgr->markTileComplete(tileID); - - // Optimization: If the draw is complete and we're the last one to have worked on it then - // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. - if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) - { - // We can increment the current BE and safely move to next draw since we know this draw is complete. - curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); - - lastRetiredDraw++; - - lockedTiles.clear(); - break; - } - } - else - { - // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. - lockedTiles.insert(tileID); - } + pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); + } + + while ((pWork = tile.peek()) != nullptr) + { + pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); + tile.dequeue(); + } + RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); + + _ReadWriteBarrier(); + + pDC->pTileMgr->markTileComplete(tileID); + + // Optimization: If the draw is complete and we're the last one to have worked on it then + // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
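Both the worker loop above and the hot-tile allocator in tilemgr.cpp below hash macrotile coordinates to a NUMA node the same way; this only partitions correctly for power-of-two node counts, which is why numaMask is set to numNodes - 1 later in this file. A sketch:

// XOR-ing the tile coordinates checkerboards ownership across nodes, so
// adjacent tiles (and their hot-tile memory) spread over all NUMA domains.
static inline uint32_t OwningNumaNode(uint32_t tileX, uint32_t tileY, uint32_t numaMask)
{
    return (tileX ^ tileY) & numaMask;
}
// A worker simply skips tiles it does not own:
//     if (OwningNumaNode(x, y, numaMask) != numaNode) continue;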
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) + { + // We can increment the current BE and safely move to next draw since we know this draw is complete. + curDrawBE++; + CompleteDrawContext(pContext, pDC); + + lastRetiredDraw++; + + lockedTiles.clear(); + break; + } + } + else + { + // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. + lockedTiles.insert(tileID); } } } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -608,8 +473,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; if (pDC->isCompute || pDC->doneFE || pDC->FeLock) { + CompleteDrawContext(pContext, pDC); curDrawFE++; - InterlockedIncrement(&pDC->threadsDoneFE); } else { @@ -673,22 +538,12 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { - bool lastToComplete = false; - uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId); - lastToComplete = queue.finishedWork(); - } - - _ReadWriteBarrier(); - - if (lastToComplete) - { - SWR_ASSERT(queue.isWorkComplete() == true); - pDC->doneCompute = true; + queue.finishedWork(); } } } @@ -704,14 +559,15 @@ DWORD workerThreadMain(LPVOID pData) RDTSC_INIT(threadId); - int numaNode = (int)pThreadData->numaId; + uint32_t numaNode = pThreadData->numaId; + uint32_t numaMask = pContext->threadPool.numaMask; // flush denormals to 0 _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); // Track tiles locked by other threads. If we try to lock a macrotile and find its already // locked then we'll add it to this list so that we don't try and lock it again. - std::unordered_set<uint32_t> lockedTiles; + TileSet lockedTiles; // each worker has the ability to work on any of the queued draws as long as certain // conditions are met. the data associated @@ -732,10 +588,10 @@ DWORD workerThreadMain(LPVOID pData) // the worker can safely increment its oldestDraw counter and move on to the next draw. std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); - auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; - uint64_t curDrawBE = 1; - uint64_t curDrawFE = 1; + uint64_t curDrawBE = 0; + uint64_t curDrawFE = 0; while (pContext->threadPool.inThreadShutdown == false) { @@ -776,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData) } RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -853,9 +709,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numThreads, KNOB_MAX_NUM_THREADS); } + uint32_t numAPIReservedThreads = 1; + + if (numThreads == 1) { - // If only 1 worker thread, try to move it to an available + // If only 1 worker threads, try to move it to an available // HW thread. If that fails, use the API thread. 
if (numCoresPerNode < numHWCoresPerNode) { @@ -878,8 +737,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { - // Save a HW thread for the API thread. - numThreads--; + // Save HW threads for the API if we can + if (numThreads > numAPIReservedThreads) + { + numThreads -= numAPIReservedThreads; + } + else + { + numAPIReservedThreads = 0; + } } pPool->numThreads = numThreads; @@ -887,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->inThreadShutdown = false; pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + pPool->numaMask = 0; if (KNOB_MAX_WORKER_THREADS) { @@ -907,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { + pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) + uint32_t workerId = 0; for (uint32_t n = 0; n < numNodes; ++n) { @@ -918,9 +787,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) auto& core = node.cores[c]; for (uint32_t t = 0; t < numHyperThreads; ++t) { - if (c == 0 && n == 0 && t == 0) + if (numAPIReservedThreads) { - // Skip core 0, thread0 on node 0 to reserve for API thread + --numAPIReservedThreads; continue; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 0fa7196f5ac..821d7dcb16e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -34,6 +34,7 @@ typedef std::thread* THREAD_PTR; struct SWR_CONTEXT; +struct DRAW_CONTEXT; struct THREAD_DATA { @@ -50,14 +51,18 @@ struct THREAD_POOL { THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; uint32_t numThreads; + uint32_t numaMask; volatile bool inThreadShutdown; THREAD_DATA *pThreadData; }; +typedef std::unordered_set<uint32_t> TileSet; + void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); +int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
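// The numaMask stored on THREAD_POOL is what WorkOnFifoBE uses to bind
// macrotiles to nodes, and, as the comment in CreateThreadPool warns, the
// numNodes - 1 mask only distributes evenly for power-of-two node counts.
// A self-contained illustration of the assignment:
#include <cassert>
#include <cstdint>

uint32_t OwningNumaNode(uint32_t tileX, uint32_t tileY, uint32_t numNodes)
{
    assert((numNodes & (numNodes - 1)) == 0); // mask math requires 2**n nodes
    const uint32_t numaMask = numNodes - 1;
    // XOR-folding the coordinates checkerboards adjacent tiles across nodes,
    // keeping each node's hot-tile memory traffic local to its workers.
    return (tileX ^ tileY) & numaMask;
}
// numNodes == 2: (0,0)->0, (1,0)->1, (0,1)->1, (1,1)->0, (2,0)->0, ...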
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 860393661e2..794577270cf 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -29,7 +29,9 @@ #include <unordered_map> #include "fifo.hpp" -#include "tilemgr.h" +#include "core/tilemgr.h" +#include "core/multisample.h" +#include "rdtsc_core.h" #define TILE_ID(x,y) ((x << 16 | y)) @@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p) _aligned_free(p); } -MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena) +MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } -void MacroTileMgr::initialize() -{ - mWorkItemsProduced = 0; - mWorkItemsConsumed = 0; - - mDirtyTiles.clear(); -} - void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) { // Should not enqueue more then what we have backing for in the hot tile manager. SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1))) + { + return; + } + uint32_t id = TILE_ID(x, y); MacroTileQueue &tile = mTiles[id]; @@ -103,3 +102,284 @@ void MacroTileMgr::markTileComplete(uint32_t id) tile.mWorkItemsFE = 0; tile.mWorkItemsBE = 0; } + +HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, + uint32_t renderTargetArrayIndex) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + } + else + { + return NULL; + } + } + else + { + // free the old tile and create a new one with enough space to hold all samples + if (numSamples > hotTile.numSamples) + { + // tile should be either uninitialized or resolved if we're deleting and switching to a + // new sample count + SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || + (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); + FreeHotTileMem(hotTile.pBuffer); + + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + } + + // if requested render target array index isn't currently loaded, need to store out the current hottile + // and load the requested array slice + if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) + { + SWR_FORMAT format; + switch (attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: 
format = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + if (hotTile.state == HOTTILE_DIRTY) + { + pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + } + + pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + hotTile.state = HOTTILE_DIRTY; + } + } + return &tile.Attachment[attachment]; +} + +HOTTILE* HotTileMgr::GetHotTileNoLoad( + SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = 0; + } + else + { + return NULL; + } + } + + return &hotTile; +} + +void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + { + _simd_store_ps(pfBuf, valR); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valG); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valB); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valA); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + { + _simd_store_ps(pfBuf, valZ); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) +{ + // convert from F32 to U8. + uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); + //broadcast 32x into __m256i... 
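// A quick consistency check on the 4x stride in the loop below, assuming the
// AVX build's KNOB_SIMD_WIDTH == 8 and SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM == 8:
// stencil is one byte per sample, so a single 256-bit _simd_store_si covers
// 8 * 4 = 32 samples per iteration, four SIMD tiles' worth, versus one SIMD
// tile per store for the 32-bit float color and depth clears above.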
+ simdscalari valS = _simd_set1_epi8(clearVal); + + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + { + _simd_store_si(pBuf, valS); + pBuf += 1; + } + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief InitializeHotTiles +/// for draw calls, we initialize the active hot tiles and perform deferred +/// load on them if tile is in invalid state. we do this in the outer thread +/// loop instead of inside the draw routine itself mainly for performance, +/// to avoid unnecessary setup every triangle +/// @todo support deferred clear +/// @param pCreateInfo - pointer to creation info. +void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID) +{ + const API_STATE& state = GetApiState(pDC); + HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + x *= KNOB_MACROTILE_X_DIM; + y *= KNOB_MACROTILE_Y_DIM; + + uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); + + // check RT if enabled + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while (_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearColorHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + colorHottileEnableMask &= ~(1 << rtSlot); + } + + // check depth if enabled + if (state.depthHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. 
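// Depth repeats the same three-way pattern as the color attachments above,
// and stencil follows below: HOTTILE_INVALID means the hot tile must first
// be populated by pfnLoadTile from the surface; HOTTILE_CLEAR satisfies a
// deferred clear in place from clearData; both paths end in HOTTILE_DIRTY so
// the tile's contents are eventually written back through pfnStoreTile.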
+ ClearDepthHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } + + // check stencil if enabled + if (state.stencilHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearStencilHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 9137941bad4..aa561badc1c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -59,7 +59,8 @@ struct MacroTileQueue ////////////////////////////////////////////////////////////////////////// /// @brief Clear fifo and unlock it. - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mFifo.clear(arena); } @@ -71,7 +72,8 @@ return mFifo.peek(); } - bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry) { return mFifo.enqueue_try_nosync(arena, entry); } @@ -104,7 +106,7 @@ private: class MacroTileMgr { public: - MacroTileMgr(Arena& arena); + MacroTileMgr(CachingArena& arena); ~MacroTileMgr() { for (auto &tile : mTiles) { } } - void initialize(); + INLINE void initialize() + { + mWorkItemsProduced = 0; + mWorkItemsConsumed = 0; + + mDirtyTiles.clear(); + } + INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; } INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } void markTileComplete(uint32_t id); @@ -135,15 +144,14 @@ public: void operator delete (void *p); private: - Arena& mArena; - SWR_FORMAT mFormat; + CachingArena& mArena; std::unordered_map<uint32_t, MacroTileQueue> mTiles; // Any tile that has work queued to it is a dirty tile. std::vector<uint32_t> mDirtyTiles; - OSALIGNLINE(LONG) mWorkItemsProduced; - OSALIGNLINE(volatile LONG) mWorkItemsConsumed; + OSALIGNLINE(LONG) mWorkItemsProduced { 0 }; + OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 }; }; ////////////////////////////////////////////////////////////////////////// @@ -224,7 +232,7 @@ public: void *operator new(size_t size); void operator delete (void *p); - void* mpTaskData; // The API thread will set this up and the callback task function will interpet this. + void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 }; @@ -241,7 +249,7 @@ enum HOTTILE_STATE struct HOTTILE { - BYTE *pBuffer; + uint8_t *pBuffer; HOTTILE_STATE state; DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
uint32_t numSamples; @@ -283,108 +291,50 @@ public: { for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) { - if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) - { - _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); - mHotTiles[x][y].Attachment[a].pBuffer = NULL; - } + FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer); } } } } - HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, - uint32_t renderTargetArrayIndex = 0) - { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); + void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); + HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0); - HotTileSet &tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; - if (hotTile.pBuffer == NULL) - { - if (create) - { - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - } - else - { - return NULL; - } - } - else - { - // free the old tile and create a new one with enough space to hold all samples - if (numSamples > hotTile.numSamples) - { - // tile should be either uninitialized or resolved if we're deleting and switching to a - // new sample count - assert((hotTile.state == HOTTILE_INVALID) || - (hotTile.state == HOTTILE_RESOLVED) || - (hotTile.state == HOTTILE_CLEAR)); - _aligned_free(hotTile.pBuffer); - - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - } + HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1); - // if requested render target array index isn't currently loaded, need to store out the current hottile - // and load the requested array slice - if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) - { - SWR_FORMAT format; - switch (attachment) - { - case SWR_ATTACHMENT_COLOR0: - case SWR_ATTACHMENT_COLOR1: - case SWR_ATTACHMENT_COLOR2: - case SWR_ATTACHMENT_COLOR3: - case SWR_ATTACHMENT_COLOR4: - case SWR_ATTACHMENT_COLOR5: - case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; - default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; - } + static void ClearColorHotTile(const HOTTILE* pHotTile); + static void ClearDepthHotTile(const HOTTILE* pHotTile); + static void ClearStencilHotTile(const HOTTILE* pHotTile); - if (hotTile.state == HOTTILE_DIRTY) - { - pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); - } - - pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); +private: + 
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - hotTile.state = HOTTILE_DIRTY; - } - } - return &tile.Attachment[attachment]; + void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) + { + void* p = nullptr; +#if defined(_WIN32) + HANDLE hProcess = GetCurrentProcess(); + p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); +#else + p = _aligned_malloc(size, align); +#endif + + return p; } - HotTileSet &GetHotTile(uint32_t macroID) + void FreeHotTileMem(void* pBuffer) { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); - - return mHotTiles[x][y]; + if (pBuffer) + { +#if defined(_WIN32) + VirtualFree(pBuffer, 0, MEM_RELEASE); +#else + _aligned_free(pBuffer); +#endif + } } - -private: - HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; - uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp index f36452f2cec..a1d665e77cc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp @@ -27,6 +27,11 @@ ******************************************************************************/ #if defined(_WIN32) +#if defined(NOMINMAX) +// GDI Plus requires non-std min / max macros be defined :( +#undef NOMINMAX +#endif + #include<Windows.h> #include <Gdiplus.h> #include <Gdiplusheaders.h> diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index b9dc48c4fd7..60a3a6af19e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -46,8 +46,7 @@ void OpenBitmapFromFile( uint32_t *height); #endif -/// @todo assume linux is always 64 bit -#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) +#if defined(_WIN64) || defined(__x86_64__) #define _MM_INSERT_EPI64 _mm_insert_epi64 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 #else @@ -89,7 +88,10 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) OSALIGNLINE(struct) BBOX { - int top, bottom, left, right; + int top{ 0 }; + int bottom{ 0 }; + int left{ 0 }; + int right{ 0 }; BBOX() {} BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} @@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX struct simdBBox { - simdscalari top, bottom, left, right; + simdscalari top; + simdscalari bottom; + simdscalari left; + simdscalari right; }; INLINE @@ -271,7 +276,7 @@ struct TransposeSingleComponent /// @brief Pass-thru for single component. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } @@ -286,7 +291,7 @@ struct Transpose8_8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); #if KNOB_SIMD_WIDTH == 8 @@ -325,7 +330,7 @@ struct Transpose8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -337,7 +342,7 @@ struct Transpose8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); @@ -361,7 +366,7 @@ struct Transpose32_32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -394,7 +399,7 @@ struct Transpose32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -426,7 +431,7 @@ struct Transpose32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { const float* pfSrc = (const float*)pSrc; __m128 src_r0 = _mm_load_ps(pfSrc + 0); @@ -456,7 +461,7 @@ struct Transpose16_16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -496,7 +501,7 @@ struct Transpose16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -535,7 +540,7 @@ struct Transpose16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalar src = _simd_load_ps((const float*)pSrc); @@ -566,7 +571,7 @@ struct Transpose24_8 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -578,7 +583,7 @@ struct Transpose32_8_24 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; @@ -592,7 +597,7 @@ struct Transpose4_4_4_4 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -604,7 +609,7 @@ struct Transpose5_6_5 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -616,7 +621,7 @@ struct Transpose9_9_9_5 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -628,7 +633,7 @@ struct Transpose5_5_5_1 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -640,7 +645,7 @@ struct Transpose10_10_10_2 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -652,7 +657,7 @@ struct Transpose11_11_10 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; // helper function to unroll loops @@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) } #endif - BYTE* pRemainderBytes = (BYTE*)pDataWords; + uint8_t* pRemainderBytes = (uint8_t*)pDataWords; for (uint32_t i = 0; i < sizeRemainderBytes; ++i) { crc = _mm_crc32_u8(crc, *pRemainderBytes++); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 734c89792f0..de856c4a095 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -47,6 +47,10 @@ #include "llvm/Analysis/CFGPrinter.h" #include "llvm/IRReader/IRReader.h" +#if LLVM_USE_INTEL_JITEVENTS +#include "llvm/ExecutionEngine/JITEventListener.h" +#endif + #include "core/state.h" #include "common/containers.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index c974a611224..4ffb0fbee01 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -53,6 +53,10 @@ #include "llvm/Config/config.h" #endif +#ifndef HAVE_LLVM +#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR +#endif + #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/Support/FileSystem.h" @@ -60,11 +64,10 @@ #include "llvm/Analysis/Passes.h" -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 #include "llvm/PassManager.h" #else #include "llvm/IR/LegacyPassManager.h" -using namespace llvm::legacy; #endif #include "llvm/CodeGen/Passes.h" @@ -166,7 +169,6 @@ struct JitManager FunctionType* mTrinaryFPTy; FunctionType* mUnaryIntTy; FunctionType* mBinaryIntTy; - FunctionType* mTrinaryIntTy; Type* mSimtFP32Ty; Type* mSimtInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 954524afd3a..a64f86006f4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -576,9 +576,12 @@ struct BlendJit : public Builder src1[i] = LOAD(pSrc1, { i }); } Value* currentMask = VIMMED1(-1); - if(state.desc.alphaToCoverageEnable) + if (state.desc.alphaToCoverageEnable) { - currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); + uint32_t bits = (1 << state.desc.numSamples) - 1; + currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); + currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test @@ -702,6 +705,12 @@ struct BlendJit : public Builder currentMask = AND(sampleMask, currentMask); } + if (state.desc.alphaToCoverageEnable) + { + Value* sampleMasked = SHL(C(1), sampleNum); + currentMask = AND(currentMask, VBROADCAST(sampleMasked)); + } + if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || state.desc.oMaskEnable) { @@ -717,7 +726,13 @@ struct BlendJit : public Builder JitManager::DumpToFile(blendFunc, ""); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + 
passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index c15bdf1e756..757ea3fe39c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -38,6 +38,8 @@ using namespace llvm; Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { + mVWidth = pJitMgr->mVWidth; + mpIRBuilder = &pJitMgr->mBuilder; mVoidTy = Type::getVoidTy(pJitMgr->mContext); @@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr) mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type - mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); if (sizeof(uint32_t*) == 4) { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 49216612cc9..239ef2ab49f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -43,6 +43,8 @@ struct Builder JitManager* mpJitMgr; IRBuilder<>* mpIRBuilder; + uint32_t mVWidth; + // Built in types. 
Type* mVoidTy; Type* mInt1Ty; @@ -54,12 +56,16 @@ struct Builder Type* mFP16Ty; Type* mFP32Ty; Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; Type* mSimdFP16Ty; Type* mSimdFP32Ty; Type* mSimdInt16Ty; Type* mSimdInt32Ty; Type* mSimdInt64Ty; Type* mSimdIntPtrTy; + Type* mSimdVectorTy; StructType* mV4FP32Ty; StructType* mV4Int32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 5394fc7bf5a..486dad8f04c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -28,6 +28,8 @@ * ******************************************************************************/ #include "builder.h" +#include "common/rdtsc_buckets.h" + #include "llvm/Support/DynamicLibrary.h" void __cdecl CallPrint(const char* fmt, ...); @@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred) Value *Builder::VIMMED1(int i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(uint32_t i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(float i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } Value *Builder::VIMMED1(bool i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VUNDEF_IPTR() { - return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); } Value *Builder::VUNDEF_I() { - return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } Value *Builder::VUNDEF(Type *ty, uint32_t size) @@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size) Value *Builder::VUNDEF_F() { - return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } Value *Builder::VUNDEF(Type* t) { - return UndefValue::get(VectorType::get(t, JM()->mVWidth)); + return UndefValue::get(VectorType::get(t, mVWidth)); } -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) { return VINSERT(vec, val, C((int64_t)index)); @@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src) return src; } - return VECTOR_SPLAT(JM()->mVWidth, src); + return VECTOR_SPLAT(mVWidth, src); } uint32_t Builder::IMMED(Value* v) @@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v) return pValConst->getZExtValue(); } +int32_t Builder::S_IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getSExtValue(); +} + Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) { std::vector<Value*> indices; @@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask) else { Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); - vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth)); + Value* 
fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); + vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth)); } return vResult; } @@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list // get a pointer to the first character in the constant string array std::vector<Constant*> geplist{C(0),C(0)}; -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); #else Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); @@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets,C(i)); @@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets, C(i)); @@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx) } else { - res = VSHUFFLE(a, a, idx); + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_I(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } } return res; } ////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPERMPS operation (shuffle 32 bit float values +/// across 128 bit lanes) in LLVM IR. If not supported on the underlying +/// platform, emulate it +/// @param a - 256bit SIMD lane(8x32bit) of float values. +/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values +Value *Builder::PERMPS(Value* a, Value* idx) +{ + Value* res; + // use avx2 permute instruction if available + if (JM()->mArch.AVX2()) + { + // llvm 3.6.0 swapped the order of the args to vpermd + res = VPERMPS(idx, a); + } + else + { + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_F(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } + } + + return res; +} + +////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
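The new PERMPS mirrors PERMD for floats, down to the lane-by-lane VEXTRACT/VINSERT fallback when AVX2 is absent and the index vector is not a constant. A hypothetical use, reversing the eight lanes of a simd register; the index-constant idiom follows the C<char>({...}) calls used elsewhere in the builder, and vSrc stands in for any mSimdFP32Ty value:

// out[i] = src[idx[i]]; a constant index lets the non-AVX2 path lower the
// permute to a single VSHUFFLE instead of eight extract/insert pairs.
Value* vRevIdx   = C<int>({ 7, 6, 5, 4, 3, 2, 1, 0 });
Value* vReversed = PERMPS(vSrc, vRevIdx);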
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a) } Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); @@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding) } Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); @@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // input could either be float or int vector; do shuffle work in int vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); @@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp if(bPackedOutput) { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, @@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits if(bPackedOutput) { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); @@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) { Value* pStack = STACKSAVE(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); + // allocate tmp stack for masked off lanes - Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + Value* vTmpPtr = ALLOCA(pSrcTy); Value *mask = MASK(vMask); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value *offset = VEXTRACT(vOffsets, C(i)); // byte pointer to component Value *storeAddress = GEP(pDst, offset); - storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + storeAddress = BITCAST(storeAddress, 
PointerType::get(pSrcTy, 0)); Value *selMask = VEXTRACT(mask, C(i)); Value *srcElem = VEXTRACT(vSrc, C(i)); // switch in a safe address to load if we're trying to access a vertex @@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high) Value* Builder::STACKSAVE() { Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 return CALL(pfnStackSave); #else return CALLA(pfnStackSave); @@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...) vsnprintf_s(strBuf, _TRUNCATE, fmt, args); OutputDebugString(strBuf); #endif + + va_end(args); } Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); @@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); } return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); #endif @@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); @@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth; i++) { + for (unsigned i = 0; i < mVWidth; i++) { idx.push_back(C(i)); } Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + mVWidth)); } - for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { - idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + idx2.push_back(C(flag ? 
i + mVWidth / 2 : i)); } return VSHUFFLE(a, inter, ConstantVector::get(idx2)); #endif } + +// rdtsc buckets macros +void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + +void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 48e0558c4dd..f43ef69d1ed 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -59,7 +59,7 @@ Value *VUNDEF_F(); Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *VINSERT(Value *vec, Value *val, uint64_t index); #endif Value *VBROADCAST(Value *src); @@ -67,6 +67,7 @@ Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); uint32_t IMMED(Value* i); +int32_t S_IMMED(Value* i); Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); @@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b); Value *PMOVSXBD(Value* a); Value *PMOVSXWD(Value* a); Value *PERMD(Value* a, Value* idx); +Value *PERMPS(Value* a, Value* idx); Value *CVTPH2PS(Value* a); Value *CVTPS2PH(Value* a, Value* rounding); Value *PMAXSD(Value* a, Value* b); @@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); } Value *VEXTRACTI128(Value* a, Constant* imm8); Value *VINSERTI128(Value* a, Value* b, Constant* imm8); + +// rdtsc buckets macros +void RDTSC_START(Value* pBucketMgr, Value* pId); +void RDTSC_STOP(Value* pBucketMgr, Value* pId); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index c5a180e27cb..2c2c56bd151 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::vector<Value*> vtxInputIndices(2, C(0)); // GEP pVtxOut = GEP(pVtxOut, C(0)); - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); // SWR_FETCH_CONTEXT::pStreams Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); 
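The BucketManager_StartBucket/StopBucket plumbing added to builder_misc.cpp above is the general recipe for letting JIT-compiled code call back into the driver: declare the function in the module being built, then register the host address with MCJIT's symbol resolver. A generic sketch of that pattern; the helper name and the void* parameter type are illustrative:

Function* DeclareHostCallback(JitManager* pJM, const char* name, void* pfnHost)
{
    FunctionType* pFuncTy = FunctionType::get(
        Type::getVoidTy(pJM->mContext),
        { PointerType::get(Type::getInt8Ty(pJM->mContext), 0) },
        false);

    Function* pFunc = cast<Function>(
        pJM->mpCurrentModule->getOrInsertFunction(name, pFuncTy));

    // Only bind the symbol once per process; MCJIT resolves it at finalize.
    if (sys::DynamicLibrary::SearchForAddressOfSymbol(name) == nullptr)
    {
        sys::DynamicLibrary::AddSymbol(name, pfnHost);
    }
    return pFunc;
}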
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) verifyFunction(*fetch); - FunctionPassManager setupPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + setupPasses(JM()->mpCurrentModule); ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) setupPasses.add(createBreakCriticalEdgesPass()); @@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) JitManager::DumpToFile(fetch, "se"); - FunctionPassManager optPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + optPasses(JM()->mpCurrentModule); ///@todo Haven't touched these either. Need to remove some of these and add others. optPasses.add(createCFGSimplificationPass()); @@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet SWRL::UncheckedFixedVector<Value*, 16> vectors; - std::vector<Constant*> pMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> pMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { pMask[i] = (C(i < 4 ? i : 4)); } @@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); // Load from the stream. - for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + for(uint32_t lane = 0; lane < mVWidth; ++lane) { // Get index Value* index = VEXTRACT(vIndices, C(lane)); @@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet vectors.push_back(wvec); } - std::vector<Constant*> v01Mask(JM()->mVWidth); - std::vector<Constant*> v23Mask(JM()->mVWidth); - std::vector<Constant*> v02Mask(JM()->mVWidth); - std::vector<Constant*> v13Mask(JM()->mVWidth); + std::vector<Constant*> v01Mask(mVWidth); + std::vector<Constant*> v23Mask(mVWidth); + std::vector<Constant*> v02Mask(mVWidth); + std::vector<Constant*> v13Mask(mVWidth); // Concatenate the vectors together. 
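// For the AVX build (mVWidth == 8, so num4Wide == 2) the masks constructed
// by the loop below evaluate to the following (derived from the formulas;
// indices >= 8 select lanes from the second VSHUFFLE operand):
//    v01Mask = { 0,1, 8, 9,  4,5,12,13 }  rows 0/1 interleaved per 4-wide block
//    v23Mask = { 2,3,10,11,  6,7,14,15 }  rows 2/3 interleaved
//    v02Mask = { 0,2, 8,10,  4,6,12,14 }  even lanes of each pair
//    v13Mask = { 1,3, 9,11,  5,7,13,15 }  odd lanes of each pair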
elements[0] = VUNDEF_F(); elements[1] = VUNDEF_F(); elements[2] = VUNDEF_F(); elements[3] = VUNDEF_F(); - for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b) { v01Mask[4 * b + 0] = C(0 + 4 * b); v01Mask[4 * b + 1] = C(1 + 4 * b); - v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth); v23Mask[4 * b + 0] = C(2 + 4 * b); v23Mask[4 * b + 1] = C(3 + 4 * b); - v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); - v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); v02Mask[4 * b + 0] = C(0 + 4 * b); v02Mask[4 * b + 1] = C(2 + 4 * b); - v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth); v13Mask[4 * b + 0] = C(1 + 4 * b); v13Mask[4 * b + 1] = C(3 + 4 * b); - v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); - v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); - std::vector<Constant*> iMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> iMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { if(((4 * b) <= i) && (i < (4 * (b + 1)))) { - iMask[i] = C(i % 4 + JM()->mVWidth); + iMask[i] = C(i % 4 + mVWidth); } else { @@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint8_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint16_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) const uint32_t (&swizzle)[4] = std::get<9>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = mSimdInt32Ty; + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ - Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask, including any swizzling const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; @@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) 
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint8_t)0), pZeroIndex);
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint16_t)0), pZeroIndex);
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     const uint32_t (&swizzle)[4] = std::get<9>(args);
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = mSimdInt32Ty;
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
-        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
         // shuffle mask, including any swizzling
         const char x = (char)swizzle[0];
         const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* (&vVertexElements)[4] = std::get<8>(args);
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
     Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-    Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+    Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
     // shuffle mask
     Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
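The vConstMask constant above (shown truncated by the hunk) is a per-128-bit-lane byte shuffle: each gathered 32-bit element carries two 16-bit components, and the mask moves the low words of all four elements to the bottom half of the lane and the high words to the top half. A scalar model of one lane:

    #include <cstdint>

    // Applies the first 16 entries of vConstMask to one 128-bit lane.
    // in  = 4 gathered 32-bit elements, each holding two 16-bit components
    // out = low words packed into bytes 0..7, high words into bytes 8..15
    static void Deinterleave16bpcLane(const uint8_t in[16], uint8_t out[16])
    {
        static const uint8_t mask[16] = { 0, 1, 4, 5, 8,  9,  12, 13,
                                          2, 3, 6, 7, 10, 11, 14, 15 };
        for (int i = 0; i < 16; ++i)
            out[i] = in[mask[i]];
    }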
["a", "round"]], ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py index 7bba435467b..0b53a929e6c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -28,7 +28,7 @@ import operator header = r""" /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 6c5f22bc47c..36baa8d794b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -293,7 +293,13 @@ struct StreamOutJit : public Builder JitManager::DumpToFile(soFunc, "SoFunc"); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp index ad73cd840a7..d001cb6b5cb 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp +++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp @@ -33,7 +33,7 @@ #include "memory/tilingtraits.h" #include "memory/Convert.h" -typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); +typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT); ////////////////////////////////////////////////////////////////////////// /// Clear Raster Tile Function Tables. @@ -54,17 +54,17 @@ struct StoreRasterTileClear /// @param pDstSurface - Destination surface state /// @param x, y - Coordinates to raster tile. INLINE static void StoreClear( - const BYTE* dstFormattedColor, + const uint8_t* dstFormattedColor, UINT dstBytesPerPixel, SWR_SURFACE_STATE* pDstSurface, UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. { // Compute destination address for raster tile. 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435467b..0b53a929e6c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
 header = r"""
/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22bc47c..36baa8d794b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
     JitManager::DumpToFile(soFunc, "SoFunc");
-    FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+    FunctionPassManager
+#else
+    llvm::legacy::FunctionPassManager
+#endif
+        passes(JM()->mpCurrentModule);
+
     passes.add(createBreakCriticalEdgesPass());
     passes.add(createCFGSimplificationPass());
     passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd840a7..d001cb6b5cb 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
 #include "memory/tilingtraits.h"
 #include "memory/Convert.h"
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
 //////////////////////////////////////////////////////////////////////////
 /// Clear Raster Tile Function Tables.
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to raster tile.
     INLINE static void StoreClear(
-        const BYTE* dstFormattedColor,
+        const uint8_t* dstFormattedColor,
         UINT dstBytesPerPixel,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
     {
         // Compute destination address for raster tile.
-        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+        uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
             (y * pDstSurface->pitch) +
             (x * dstBytesPerPixel);
         // start of first row
-        BYTE* pDst = pDstTile;
+        uint8_t* pDst = pDstTile;
         UINT dstBytesPerRow = 0;
         // For each raster tile pixel in row 0 (rx, 0)
@@ -104,15 +104,15 @@ struct StoreMacroTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to macro tile
     static void StoreClear(
-        const FLOAT *pColor,
+        const float *pColor,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y)
     {
         UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
-        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+        uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
-        FLOAT srcColor[4];
+        float srcColor[4];
         for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad4bd8..7c185e5e454 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val)
 /// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
 template<SWR_FORMAT DstFormat>
 static void ConvertPixelFromFloat(
-    BYTE* pDstPixel,
+    uint8_t* pDstPixel,
     const float srcPixel[4])
 {
-    UINT outColor[4]; // typeless bits
+    uint32_t outColor[4] = { 0 }; // typeless bits
     // Store component
     for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -390,9 +390,9 @@ static void ConvertPixelFromFloat(
 template<SWR_FORMAT SrcFormat>
 INLINE static void ConvertPixelToFloat(
     float dstPixel[4],
-    const BYTE* pSrc)
+    const uint8_t* pSrc)
 {
-    UINT srcColor[4]; // typeless bits
+    uint32_t srcColor[4]; // typeless bits
     // unpack src pixel
     typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
     }
     // Convert components
-    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+    for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
     {
         SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
-        UINT src = srcColor[comp];
+        uint32_t src = srcColor[comp];
         switch (type)
         {
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
         }
         case SWR_TYPE_UINT:
         {
-            UINT dst = (UINT)src;
+            uint32_t dst = (uint32_t)src;
             dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
             break;
         }
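The `= { 0 }` initializer on outColor is a real fix, not just a type rename: formats with fewer than four components previously left the unused entries uninitialized, so packing those typeless words could write stack garbage into the destination pixel. For orientation, a minimal model of one conversion path (an assumed 8-bit UNORM component; the real code dispatches on FormatTraits<DstFormat> and handles many more types):

    #include <algorithm>
    #include <cstdint>

    // Sketch of a single UNORM8 component conversion, one of the paths
    // ConvertPixelFromFloat can select. Not the driver's actual code.
    static uint32_t PackUnorm8(float v)
    {
        v = std::min(std::max(v, 0.0f), 1.0f); // clamp to [0, 1]
        return (uint32_t)(v * 255.0f + 0.5f);  // scale and round to 8 bits
    }

    // With uint32_t outColor[4] = { 0 }, components the format lacks stay 0
    // instead of contributing indeterminate bits to the packed pixel.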
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57c22a..381ac89a7b8 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
 #pragma once
 #include "core/state.h"
+#include "common/simdintrin.h"
 template<SWR_TILE_MODE mode, int> struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
     static UINT GetPdepY() { return 0x1ea; }
 };
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
-    // using bsf instead of funky loop
-    DWORD maskIndex;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. populate LSB from src
-        const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
-        // 3. copy bit from mask
-        result |= LSB & lowest;
-
-        // 4. clear lowest bit
-        mask &= ~lowest;
-
-        // 5. prepare for next iteration
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT result = 0;
-    DWORD maskIndex;
-    uint32_t currentBit = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. copy bit from mask
-        result |= ((a & lowest) > 0) << currentBit++;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the tileID for 2D tiled surfaces
 /// @param pitch - surface pitch in bytes
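The pdep/pext emulation removed here is not lost; the new common/simdintrin.h include suggests it moves next to the other SIMD helpers. For reference, the same algorithms in a portable form, without the MSVC-style _BitScanForward dependency (a sketch, not the relocated code verbatim):

    #include <cstdint>

    // Software PDEP: deposit successive low bits of 'a' into the set bit
    // positions of 'mask', from least to most significant.
    static uint32_t pdep_u32_soft(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        while (mask)
        {
            uint32_t lowest = mask & (0u - mask); // isolate lowest set bit
            if (a & 1)
                result |= lowest;                 // deposit current source bit
            a >>= 1;
            mask &= mask - 1;                     // clear lowest set bit
        }
        return result;
    }

    // Software PEXT: extract the bits of 'a' selected by 'mask' and pack
    // them contiguously starting at bit 0.
    static uint32_t pext_u32_soft(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 0; mask; mask &= mask - 1, ++bit)
        {
            uint32_t lowest = mask & (0u - mask);
            if (a & lowest)
                result |= 1u << bit;
        }
        return result;
    }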
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab69815b1..3d003fb4a33 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e8e73..0f3ded68544 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -21,24 +21,20 @@
 # Python source
 KNOBS = [
-    ['ENABLE_ASSERT_DIALOGS', {
-        'type' : 'bool',
-        'default' : 'true',
-        'desc' : ['Use dialogs when asserts fire.',
-                  'Asserts are only enabled in debug builds'],
-    }],
     ['SINGLE_THREADED', {
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['If enabled will perform all rendering on the API thread.',
                   'This is useful mainly for debugging purposes.'],
+        'category' : 'debug',
     }],
     ['DUMP_SHADER_IR', {
-        'type' : 'bool',
-        'default' : 'false',
-        'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'type' : 'bool',
+        'default' : 'false',
+        'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'category' : 'debug',
     }],
     ['USE_GENERIC_STORETILE', {
@@ -46,6 +42,7 @@ KNOBS = [
         'default' : 'false',
         'desc' : ['Always use generic function for performing StoreTile.',
                   'Will be slightly slower than using optimized (jitted) path'],
+        'category' : 'debug',
     }],
     ['FAST_CLEAR', {
@@ -53,6 +50,7 @@ KNOBS = [
         'default' : 'true',
         'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
                   'defer clear execution to first backend op on hottile, or hottile store'],
+        'category' : 'perf',
     }],
     ['MAX_NUMA_NODES', {
@@ -61,6 +59,7 @@ KNOBS = [
         'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
                   '  0 == ALL NUMA-nodes in the system',
                   '  N == Use at most N NUMA-nodes for rendering'],
+        'category' : 'perf',
     }],
     ['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +68,7 @@ KNOBS = [
         'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
                   '  0 == ALL non-API thread cores per NUMA-node',
                   '  N == Use at most N cores per NUMA-node'],
+        'category' : 'perf',
     }],
     ['MAX_THREADS_PER_CORE', {
@@ -77,6 +77,7 @@ KNOBS = [
         'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
                   '  0 == ALL hyper-threads per core',
                   '  N == Use at most N hyper-threads per physical core'],
+        'category' : 'perf',
     }],
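The new 'category' field only tags each knob as debug- or perf-oriented for the generator; the generated C++ surface is unchanged, and every entry still becomes a KNOB_<NAME> global through DEFINE_KNOB (see the knobs.template hunk further down). A rough sketch of how such a macro could expand and be consumed; the expansion below is assumed, not taken from the generated header:

    #include <cstdint>

    // Assumed expansion: one global per knob, initialized to the table default.
    #define DEFINE_KNOB(name, type, defaultValue) type KNOB_##name = defaultValue

    DEFINE_KNOB(SINGLE_THREADED, bool, false);
    DEFINE_KNOB(MAX_NUMA_NODES, uint32_t, 0);

    // Typical consumption in driver code:
    // if (KNOB_SINGLE_THREADED) { /* run front and back end on the API thread */ }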
    ['MAX_WORKER_THREADS', {
@@ -87,6 +88,7 @@ KNOBS = [
                   'IMPORTANT: If this is non-zero, no worker threads will be bound to',
                   'specific HW threads.  They will all be "floating" SW threads.',
                   'In this case, the above 3 KNOBS will be ignored.'],
+        'category' : 'perf',
     }],
     ['BUCKETS_START_FRAME', {
@@ -96,6 +98,7 @@ KNOBS = [
                   '',
                   'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                   'for this to have an effect.'],
+        'category' : 'perf',
     }],
     ['BUCKETS_END_FRAME', {
@@ -105,6 +108,7 @@ KNOBS = [
                   '',
                   'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                   'for this to have an effect.'],
+        'category' : 'perf',
     }],
     ['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +116,32 @@ KNOBS = [
         'default' : '5000',
         'desc' : ['Number of spin-loop iterations worker threads will perform',
                   'before going to sleep when waiting for work'],
+        'category' : 'perf',
     }],
     ['MAX_DRAWS_IN_FLIGHT', {
         'type' : 'uint32_t',
-        'default' : '160',
+        'default' : '96',
         'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+        'category' : 'perf',
     }],
     ['MAX_PRIMS_PER_DRAW', {
-        'type' : 'uint32_t',
-        'default' : '2040',
-        'desc' : ['Maximum primitives in a single Draw().',
+        'type' : 'uint32_t',
+        'default' : '2040',
+        'desc' : ['Maximum primitives in a single Draw().',
                   'Larger primitives are split into smaller Draw calls.',
                   'Should be a multiple of (3 * vectorWidth).'],
+        'category' : 'perf',
     }],
     ['MAX_TESS_PRIMS_PER_DRAW', {
-        'type' : 'uint32_t',
-        'default' : '16',
-        'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
+        'type' : 'uint32_t',
+        'default' : '16',
+        'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
                   'Larger primitives are split into smaller Draw calls.',
                   'Should be a multiple of (vectorWidth).'],
-    }],
-
-    ['MAX_FRAC_ODD_TESS_FACTOR', {
-        'type' : 'float',
-        'default' : '63.0f',
-        'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
-    }],
-
-    ['MAX_FRAC_EVEN_TESS_FACTOR', {
-        'type' : 'float',
-        'default' : '64.0f',
-        'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
-    }],
-
-    ['MAX_INTEGER_TESS_FACTOR', {
-        'type' : 'uint32_t',
-        'default' : '64',
-        'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+        'category' : 'perf',
     }],
@@ -159,12 +149,14 @@ KNOBS = [
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['Enable threadviz output.'],
+        'category' : 'perf',
     }],
     ['TOSS_DRAW', {
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['Disable per-draw/dispatch execution'],
+        'category' : 'perf',
     }],
     ['TOSS_QUEUE_FE', {
@@ -173,6 +165,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at worker FE',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_FETCH', {
@@ -181,6 +174,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at vertex fetch',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_IA', {
@@ -189,6 +183,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at input assembler',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_VS', {
@@ -197,6 +192,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at vertex shader',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_SETUP_TRIS', {
@@ -205,6 +201,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at primitive setup',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
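Two behavioral changes hide in the formatting churn above: MAX_DRAWS_IN_FLIGHT drops from 160 to 96, and the three (DEBUG) tessellation-factor knobs are deleted outright. Run through the template's DEFINE_KNOB expansion (shown in the knobs.template hunk below), the surviving entries would generate declarations along these lines (presumed generator output, shape assumed from the template):

    // Presumed generated declarations for the updated defaults:
    DEFINE_KNOB(MAX_DRAWS_IN_FLIGHT, uint32_t, 96);
    DEFINE_KNOB(MAX_PRIMS_PER_DRAW, uint32_t, 2040);
    DEFINE_KNOB(MAX_TESS_PRIMS_PER_DRAW, uint32_t, 16);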
    ['TOSS_BIN_TRIS', {
@@ -213,6 +210,7 @@ KNOBS = [
        'desc' : ['Stop per-draw execution at primitive binning',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_RS', {
@@ -221,6 +219,5 @@ KNOBS = [
        'desc' : ['Stop per-draw execution at rasterizer',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-    }],
-
-]
+        'category' : 'perf',
+    }],]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e7e16..521346ca833 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@ return ' '*(max_len - knob_len)
 %>/******************************************************************************
 *
-* Copyright 2015
+* Copyright 2015-2016
 * Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,11 @@ struct GlobalKnobs
     % for line in knob[1]['desc']:
     // ${line}
     % endfor
+    % if knob[1]['type'] == 'std::string':
+    DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+    % else:
     DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+    % endif
     % endfor
     GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
     % if knob[1]['type'] == 'bool':
     str << (KNOB_${knob[0]} ? "+\n" : "-\n");
-    % elif knob[1]['type'] != 'float':
+    % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
     str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
     str << std::dec << KNOB_${knob[0]} << "\n";
     % else:
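The template changes teach the generator about std::string knobs in two places: DEFINE_KNOB quotes the default (repr() yields a quoted Python literal, and [1:-1] strips those quotes so the escaped body can be re-wrapped in C++ double quotes), and ToString() skips the hex column, which only makes sense for integer knobs. A scalar model of the resulting ToString branching (a hand-written mirror, not generator output):

    #include <cstdint>
    #include <iomanip>
    #include <sstream>
    #include <string>

    // Integer knobs: hex column, then decimal (mirrors the non-float,
    // non-string branch in the template above).
    static void AppendKnob(std::ostringstream& s, const char* name, uint32_t v)
    {
        s << name << ": " << std::hex << std::setw(11) << std::left << v
          << std::dec << v << "\n";
    }

    // String (and float) knobs: plain operator<<; hex formatting would be
    // meaningless here, hence the extra type check in the template.
    static void AppendKnob(std::ostringstream& s, const char* name,
                           const std::string& v)
    {
        s << name << ": " << v << "\n";
    }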