summary refs log tree commit diff stats
path: root/src/gallium
diff options
context:
space:
mode:
author Tim Rowley <[email protected]> 2016-02-16 17:28:09 -0600
committer Tim Rowley <[email protected]> 2016-03-02 18:38:41 -0600
commit c6e67f5a9373e916a8d2333585cb5787aa5f7bb7 (patch)
tree 5b5c60bea784f16736c394c989fdd5df3ebae233 /src/gallium
parent 2b2d3680bf164ec4f8b50436b96c3fc195318ea5 (diff)
gallium/swr: add OpenSWR rasterizer
Acked-by: Roland Scheidegger <[email protected]>
Acked-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/containers.hpp208
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/formats.cpp5469
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/formats.h251
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/isa.hpp235
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/os.h221
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp188
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h229
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h167
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h787
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp238
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/swr_assert.h109
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp1511
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.h500
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/arena.cpp166
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/arena.h69
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.cpp1899
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.h59
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/blend.h318
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/clip.cpp201
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/clip.h868
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h495
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/depthstencil.h245
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/fifo.hpp136
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_conversion.h196
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_traits.h3548
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_types.h1075
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/frontend.cpp2345
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/frontend.h327
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/knobs.h142
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/knobs_init.h98
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/multisample.cpp51
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/multisample.h620
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/pa.h1208
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp1177
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp1393
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/rasterizer.h35
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp91
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h177
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/state.h1027
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tessellator.h88
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp962
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.h63
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp105
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tilemgr.h390
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.cpp148
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.h831
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp313
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/JitManager.h186
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp772
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h93
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.cpp71
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.h71
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_math.h34
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp1447
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h149
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp1431
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h128
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/jit_api.h108
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py401
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py341
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp357
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h94
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp287
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/Convert.h698
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp396
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp1717
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h581
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h263
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py79
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py226
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py8
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py845
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py178
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py238
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py62
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py1237
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py174
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py373
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py201
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py441
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py359
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py594
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py299
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py232
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py878
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/template.py705
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/mako/util.py360
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template141
88 files changed, 48234 insertions, 0 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
new file mode 100644
index 00000000000..bc96c5f62fd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -0,0 +1,208 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#ifndef SWRLIB_CONTAINERS_HPP__
+#define SWRLIB_CONTAINERS_HPP__
+
+#include <functional>
+#include "common/os.h"
+
+namespace SWRL
+{
+
+template <typename T, int NUM_ELEMENTS> // Vector-like container over an in-object array of NUM_ELEMENTS T's; "unchecked" = indices and capacity are never validated here.
+struct UncheckedFixedVector
+{
+ UncheckedFixedVector() : mSize(0) // Creates an empty vector (size 0).
+ {
+ }
+
+ UncheckedFixedVector(std::size_t size, T const& exemplar) // Builds a vector holding `size` copies of `exemplar`.
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0; i < size; ++i)
+ this->push_back(exemplar); // push_back does no capacity check: caller must keep size <= NUM_ELEMENTS
+ }
+
+ template <typename Iter> // Range constructor: appends every element of [fst, lst) in order.
+ UncheckedFixedVector(Iter fst, Iter lst)
+ {
+ this->mSize = 0;
+ for ( ; fst != lst; ++fst)
+ this->push_back(*fst); // again unchecked: the range must not exceed NUM_ELEMENTS items
+ }
+
+ UncheckedFixedVector(UncheckedFixedVector const& UFV) // Copy constructor: copies only the first UFV.size() elements.
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i]; // element-wise copy assignment into the already-existing array slots
+ this->mSize = UFV.size();
+ }
+
+ UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) // Copy assignment: overwrites the first UFV.size() slots and adopts its size; returns *this.
+ {
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i]; // slots at index >= UFV.size() keep their previous contents
+ this->mSize = UFV.size();
+ return *this;
+ }
+
+ T* begin() { return &this->mElements[0]; } // Raw pointers serve as iterators over [0, mSize).
+ T* end() { return &this->mElements[0] + this->mSize; } // One past the last in-use element.
+ T const* begin() const { return &this->mElements[0]; }
+ T const* end() const { return &this->mElements[0] + this->mSize; }
+
+ friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) // Equal when sizes match and no in-use element pair compares !=.
+ {
+ if (L.size() != R.size()) return false;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return false; // relies on T providing operator!=
+ }
+ return true;
+ }
+
+ friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) // Mirror image of operator==: true on size mismatch or any differing element.
+ {
+ if (L.size() != R.size()) return true;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return true;
+ }
+ return false;
+ }
+
+ T& operator[](std::size_t idx) // Unchecked element access; idx is compared against neither mSize nor NUM_ELEMENTS.
+ {
+ return this->mElements[idx];
+ }
+ T const& operator[](std::size_t idx) const // Const overload of the unchecked accessor.
+ {
+ return this->mElements[idx];
+ }
+ void push_back(T const& t) // Appends a copy of t; there is no check that mSize < NUM_ELEMENTS.
+ {
+ this->mElements[this->mSize] = t;
+ ++this->mSize;
+ }
+ void pop_back() // Drops the last element by shrinking mSize; the slot itself is left as-is.
+ {
+ SWR_ASSERT(this->mSize > 0); // the only guarded operation in this class
+ --this->mSize;
+ }
+ T& back() // Last in-use element; no emptiness check (mSize-1 wraps around when mSize == 0).
+ {
+ return this->mElements[this->mSize-1];
+ }
+ T const& back() const // Const overload of back(), same precondition.
+ {
+ return this->mElements[this->mSize-1];
+ }
+ bool empty() const // True when no elements are in use.
+ {
+ return this->mSize == 0;
+ }
+ std::size_t size() const // Number of in-use elements.
+ {
+ return this->mSize;
+ }
+ void resize(std::size_t sz) // Only changes the logical size; elements are not touched and sz is not validated.
+ {
+ this->mSize = sz;
+ }
+ void clear() // Equivalent to resize(0).
+ {
+ this->resize(0);
+ }
+private:
+ std::size_t mSize; // count of in-use slots at the front of mElements
+ T mElements[NUM_ELEMENTS]; // inline storage; all NUM_ELEMENTS objects exist for the container's whole lifetime
+};
+
+template <typename T, int NUM_ELEMENTS> // Stack-style (push/pop/top) facade over UncheckedFixedVector with the same T and capacity.
+struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> // struct inheritance is public, so the base vector API stays accessible too
+{
+ FixedStack() {} // Starts empty via the base default constructor.
+
+ void push(T const& t) // Pushes a copy of t; forwards to push_back (no capacity check there).
+ {
+ this->push_back(t);
+ }
+
+ void pop() // Removes the top element; forwards to pop_back, which asserts non-empty via SWR_ASSERT.
+ {
+ this->pop_back();
+ }
+
+ T& top() // Reference to the most recently pushed element; forwards to back(), which does not check for empty.
+ {
+ return this->back();
+ }
+
+ T const& top() const // Const overload of top().
+ {
+ return this->back();
+ }
+};
+
+template <typename T> // Hash functor that runs the raw object representation of T through _mm_crc32_u32, one UINT word at a time.
+struct CRCHash
+{
+ static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B"); // T must be a whole number of UINT words
+ UINT operator()(const T& k) const // Returns the CRC accumulated over all sizeof(T) bytes of k.
+ {
+ UINT *pData = (UINT*)&k; // views k as an array of UINT words (cast drops const; the data is only read)
+ UINT crc = 0; // accumulator starts at 0
+ for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i)
+ {
+ crc = _mm_crc32_u32(crc, pData[i]); // SSE4.2 CRC32 intrinsic; NOTE(review): every byte of T is hashed, so any padding bytes would contribute - confirm key types have none
+ }
+ return crc;
+ }
+};
+
+}// end SWRL
+
+namespace std
+{
+
+template <typename T, int N> // std::hash specialization for SWRL::UncheckedFixedVector (for use as a key in hashed containers).
+struct hash<SWRL::UncheckedFixedVector<T, N>>
+{
+ size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const // Combines std::hash<T> of each in-use element, in order.
+ {
+ if (v.size() == 0) return 0; // empty vector hashes to 0
+ std::hash<T> H; // per-element hasher; requires std::hash<T> to be available
+ size_t x = H(v[0]); // seed with the first element's hash
+ if (v.size() == 1) return x; // single element: its own hash (the loop below would not run anyway)
+ for (size_t i = 1; i < v.size(); ++i)
+ x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); // order-dependent mix; same formula as boost::hash_combine
+ return x;
+ }
+};
+
+
+}// end std.
+
+#endif//SWRLIB_CONTAINERS_HPP__
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
new file mode 100644
index 00000000000..ed8ce7e5b0f
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
@@ -0,0 +1,5469 @@
+
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file formats.cpp
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+#include "formats.h"
+
+// Lookup table for unorm8 sRGB -> float conversion, indexed by the 8-bit value (256 entries). Entries are stored as uint32_t; they look like IEEE-754 single-precision bit patterns (first 0x00000000 = 0.0f, last 0x3f800000 = 1.0f) - presumably reinterpreted as float by the user of the table; confirm at the use site.
+const uint32_t srgb8Table[256] = {
+ 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd,
+ 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1,
+ 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9,
+ 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f,
+ 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb,
+ 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19,
+ 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3,
+ 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307,
+ 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283,
+ 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333,
+ 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54,
+ 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8,
+ 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1,
+ 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7,
+ 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2,
+ 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000,
+};
+
+// order must match SWR_FORMAT
+const SWR_FORMAT_INFO gFormatInfo[] = {
+ // R32G32B32A32_FLOAT (0x0)
+ {
+ "R32G32B32A32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32A32_SINT (0x1)
+ {
+ "R32G32B32A32_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32A32_UINT (0x2)
+ {
+ "R32G32B32A32_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x3 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R32G32B32X32_FLOAT (0x6)
+ {
+ "R32G32B32X32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32A32_SSCALED (0x7)
+ {
+ "R32G32B32A32_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32A32_USCALED (0x8)
+ {
+ "R32G32B32A32_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 32, 32, 32, 32 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x9 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xc (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xd (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xe (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xf (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x10 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x11 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x14 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x18 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x19 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x20 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x21 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x22 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x23 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x24 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x25 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x26 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x27 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x28 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x29 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x2f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x30 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x31 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x32 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x33 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x34 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x35 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x36 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x37 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x38 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x39 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x3f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R32G32B32_FLOAT (0x40)
+ {
+ "R32G32B32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 32, 32, 32, 0 }, // Bits per component
+ 96, // Bits per element
+ 12, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32_SINT (0x41)
+ {
+ "R32G32B32_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 32, 32, 32, 0 }, // Bits per component
+ 96, // Bits per element
+ 12, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32_UINT (0x42)
+ {
+ "R32G32B32_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 32, 32, 32, 0 }, // Bits per component
+ 96, // Bits per element
+ 12, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x43 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x44 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R32G32B32_SSCALED (0x45)
+ {
+ "R32G32B32_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 32, 32, 32, 0 }, // Bits per component
+ 96, // Bits per element
+ 12, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32B32_USCALED (0x46)
+ {
+ "R32G32B32_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 32, 32, 32, 0 }, // Bits per component
+ 96, // Bits per element
+ 12, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x47 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x48 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x49 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x4f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x50 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x51 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x52 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x53 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x54 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x55 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x56 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x57 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x58 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x59 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x5f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x60 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x61 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x62 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x63 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x64 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x65 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x66 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x67 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x68 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x69 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x6f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x70 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x71 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x72 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x73 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x74 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x75 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x76 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x77 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x78 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x79 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x7f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R16G16B16A16_UNORM (0x80)
+ {
+ "R16G16B16A16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16A16_SNORM (0x81)
+ {
+ "R16G16B16A16_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16A16_SINT (0x82)
+ {
+ "R16G16B16A16_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16A16_UINT (0x83)
+ {
+ "R16G16B16A16_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16A16_FLOAT (0x84)
+ {
+ "R16G16B16A16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32_FLOAT (0x85)
+ {
+ "R32G32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32_SINT (0x86)
+ {
+ "R32G32_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32_UINT (0x87)
+ {
+ "R32G32_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32_FLOAT_X8X24_TYPELESS (0x88)
+ {
+ "R32_FLOAT_X8X24_TYPELESS",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // X32_TYPELESS_G8X24_UINT (0x89)
+ {
+ "X32_TYPELESS_G8X24_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // L32A32_FLOAT (0x8a)
+ {
+ "L32A32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x8b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x8c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x8d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R16G16B16X16_UNORM (0x8e)
+ {
+ "R16G16B16X16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16X16_FLOAT (0x8f)
+ {
+ "R16G16B16X16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x90 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L32X32_FLOAT (0x91)
+ {
+ "L32X32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // I32X32_FLOAT (0x92)
+ {
+ "I32X32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // R16G16B16A16_SSCALED (0x93)
+ {
+ "R16G16B16A16_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16A16_USCALED (0x94)
+ {
+ "R16G16B16A16_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 16, 16, 16, 16 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32_SSCALED (0x95)
+ {
+ "R32G32_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32G32_USCALED (0x96)
+ {
+ "R32G32_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x97 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R32_FLOAT_X8X24_TYPELESS_LD (0x98)
+ {
+ "R32_FLOAT_X8X24_TYPELESS_LD",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 32, 32, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x99 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x9f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa0 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa1 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa3 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa4 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa7 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa8 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xa9 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xaa (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xab (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xac (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xad (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xae (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xaf (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb0 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb1 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb3 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb4 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb7 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb8 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xb9 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xba (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xbb (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xbc (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xbd (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xbe (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xbf (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // B8G8R8A8_UNORM (0xc0)
+ {
+ "B8G8R8A8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B8G8R8A8_UNORM_SRGB (0xc1)
+ {
+ "B8G8R8A8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_UNORM (0xc2)
+ {
+ "R10G10B10A2_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_UNORM_SRGB (0xc3)
+ {
+ "R10G10B10A2_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_UINT (0xc4)
+ {
+ "R10G10B10A2_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xc5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xc6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8G8B8A8_UNORM (0xc7)
+ {
+ "R8G8B8A8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_UNORM_SRGB (0xc8)
+ {
+ "R8G8B8A8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_SNORM (0xc9)
+ {
+ "R8G8B8A8_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_SINT (0xca)
+ {
+ "R8G8B8A8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_UINT (0xcb)
+ {
+ "R8G8B8A8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_UNORM (0xcc)
+ {
+ "R16G16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_SNORM (0xcd)
+ {
+ "R16G16_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_SINT (0xce)
+ {
+ "R16G16_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_UINT (0xcf)
+ {
+ "R16G16_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_FLOAT (0xd0)
+ {
+ "R16G16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_UNORM (0xd1)
+ {
+ "B10G10R10A2_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_UNORM_SRGB (0xd2)
+ {
+ "B10G10R10A2_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R11G11B10_FLOAT (0xd3)
+ {
+ "R11G11B10_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 11, 11, 10, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xd4 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xd5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R32_SINT (0xd6)
+ {
+ "R32_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32_UINT (0xd7)
+ {
+ "R32_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32_FLOAT (0xd8)
+ {
+ "R32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R24_UNORM_X8_TYPELESS (0xd9)
+ {
+ "R24_UNORM_X8_TYPELESS",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 24, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xda (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xdb (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R24_UNORM_X8_TYPELESS_LD (0xdc)
+ {
+ "R24_UNORM_X8_TYPELESS_LD",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 24, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // L32_UNORM (0xdd)
+ {
+ "L32_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 4294967295.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0xde (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L16A16_UNORM (0xdf)
+ {
+ "L16A16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // I24X8_UNORM (0xe0)
+ {
+ "I24X8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 24, 8, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L24X8_UNORM (0xe1)
+ {
+ "L24X8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 24, 8, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0xe2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // I32_FLOAT (0xe3)
+ {
+ "I32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L32_FLOAT (0xe4)
+ {
+ "L32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // A32_FLOAT (0xe5)
+ {
+ "A32_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 3, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xe6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xe7 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xe8 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // B8G8R8X8_UNORM (0xe9)
+ {
+ "B8G8R8X8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B8G8R8X8_UNORM_SRGB (0xea)
+ {
+ "B8G8R8X8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8X8_UNORM (0xeb)
+ {
+ "R8G8B8X8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8X8_UNORM_SRGB (0xec)
+ {
+ "R8G8B8X8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R9G9B9E5_SHAREDEXP (0xed)
+ {
+ "R9G9B9E5_SHAREDEXP",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 9, 9, 9, 5 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10X2_UNORM (0xee)
+ {
+ "B10G10R10X2_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xef (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L16A16_FLOAT (0xf0)
+ {
+ "L16A16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0xf1 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xf2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R10G10B10X2_USCALED (0xf3)
+ {
+ "R10G10B10X2_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_SSCALED (0xf4)
+ {
+ "R8G8B8A8_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8A8_USCALED (0xf5)
+ {
+ "R8G8B8A8_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_SSCALED (0xf6)
+ {
+ "R16G16_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16_USCALED (0xf7)
+ {
+ "R16G16_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 16, 16, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32_SSCALED (0xf8)
+ {
+ "R32_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R32_USCALED (0xf9)
+ {
+ "R32_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 32, 0, 0, 0 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0xfa (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xfb (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xfc (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xfd (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xfe (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0xff (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // B5G6R5_UNORM (0x100)
+ {
+ "B5G6R5_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 0 }, // Swizzle
+ { 5, 6, 5, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B5G6R5_UNORM_SRGB (0x101)
+ {
+ "B5G6R5_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 0 }, // Swizzle
+ { 5, 6, 5, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 3, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B5G5R5A1_UNORM (0x102)
+ {
+ "B5G5R5A1_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 5, 5, 5, 1 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B5G5R5A1_UNORM_SRGB (0x103)
+ {
+ "B5G5R5A1_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 5, 5, 5, 1 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B4G4R4A4_UNORM (0x104)
+ {
+ "B4G4R4A4_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 4, 4, 4, 4 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B4G4R4A4_UNORM_SRGB (0x105)
+ {
+ "B4G4R4A4_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 4, 4, 4, 4 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_UNORM (0x106)
+ {
+ "R8G8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_SNORM (0x107)
+ {
+ "R8G8_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_SINT (0x108)
+ {
+ "R8G8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_UINT (0x109)
+ {
+ "R8G8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_UNORM (0x10a)
+ {
+ "R16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_SNORM (0x10b)
+ {
+ "R16_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_SINT (0x10c)
+ {
+ "R16_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_UINT (0x10d)
+ {
+ "R16_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_FLOAT (0x10e)
+ {
+ "R16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x10f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x110 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // I16_UNORM (0x111)
+ {
+ "I16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L16_UNORM (0x112)
+ {
+ "L16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // A16_UNORM (0x113)
+ {
+ "A16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 3, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // L8A8_UNORM (0x114)
+ {
+ "L8A8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // I16_FLOAT (0x115)
+ {
+ "I16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L16_FLOAT (0x116)
+ {
+ "L16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // A16_FLOAT (0x117)
+ {
+ "A16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 3, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // L8A8_UNORM_SRGB (0x118)
+ {
+ "L8A8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x119 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // B5G5R5X1_UNORM (0x11a)
+ {
+ "B5G5R5X1_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 5, 5, 5, 1 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B5G5R5X1_UNORM_SRGB (0x11b)
+ {
+ "B5G5R5X1_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 5, 5, 5, 1 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_SSCALED (0x11c)
+ {
+ "R8G8_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8_USCALED (0x11d)
+ {
+ "R8G8_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_SSCALED (0x11e)
+ {
+ "R16_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16_USCALED (0x11f)
+ {
+ "R16_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 16, 0, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x120 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x121 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x122 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x123 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x124 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x125 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L8A8_UINT (0x126)
+ {
+ "L8A8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L8A8_SINT (0x127)
+ {
+ "L8A8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 3, 0, 0 }, // Swizzle
+ { 8, 8, 0, 0 }, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 2, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x128 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x129 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x12f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x130 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x131 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x132 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x133 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x134 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x135 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x136 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x137 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x138 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x139 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x13f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8_UNORM (0x140)
+ {
+ "R8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8_SNORM (0x141)
+ {
+ "R8_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8_SINT (0x142)
+ {
+ "R8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8_UINT (0x143)
+ {
+ "R8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // A8_UNORM (0x144)
+ {
+ "A8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 3, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // I8_UNORM (0x145)
+ {
+ "I8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L8_UNORM (0x146)
+ {
+ "L8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x147 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x148 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8_SSCALED (0x149)
+ {
+ "R8_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8_USCALED (0x14a)
+ {
+ "R8_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x14b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L8_UNORM_SRGB (0x14c)
+ {
+ "L8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x14d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x14e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x14f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x150 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x151 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // L8_UINT (0x152)
+ {
+ "L8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // L8_SINT (0x153)
+ {
+ "L8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // I8_UINT (0x154)
+ {
+ "I8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // I8_SINT (0x155)
+ {
+ "I8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 8, // Bits per element
+ 1, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 0, 0, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ true, // isLuminance
+ },
+ // 0x156 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x157 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x158 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x159 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x15f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x160 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x161 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x162 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x163 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x164 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x165 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x166 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x167 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x168 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x169 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x16f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x170 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x171 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x172 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x173 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x174 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x175 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x176 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x177 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x178 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x179 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17a (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17b (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17c (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17d (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x17f (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x180 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x181 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x182 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // YCRCB_SWAPUVY (0x183)
+ {
+ "YCRCB_SWAPUVY",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ true, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 2, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x184 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x185 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // BC1_UNORM (0x186)
+ {
+ "BC1_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC2_UNORM (0x187)
+ {
+ "BC2_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC3_UNORM (0x188)
+ {
+ "BC3_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC4_UNORM (0x189)
+ {
+ "BC4_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC5_UNORM (0x18a)
+ {
+ "BC5_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC1_UNORM_SRGB (0x18b)
+ {
+ "BC1_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 1, // Num components
+ true, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC2_UNORM_SRGB (0x18c)
+ {
+ "BC2_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ true, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC3_UNORM_SRGB (0x18d)
+ {
+ "BC3_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ true, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // 0x18e (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // YCRCB_SWAPUV (0x18f)
+ {
+ "YCRCB_SWAPUV",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 8, 8, 8, 8 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ true, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 2, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x190 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x191 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x192 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8G8B8_UNORM (0x193)
+ {
+ "R8G8B8_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8_SNORM (0x194)
+ {
+ "R8G8B8_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8_SSCALED (0x195)
+ {
+ "R8G8B8_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8_USCALED (0x196)
+ {
+ "R8G8B8_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x197 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x198 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // BC4_SNORM (0x199)
+ {
+ "BC4_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 64, // Bits per element
+ 8, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC5_SNORM (0x19a)
+ {
+ "BC5_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_FLOAT (0x19b)
+ {
+ "R16G16B16_FLOAT",
+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_UNORM (0x19c)
+ {
+ "R16G16B16_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_SNORM (0x19d)
+ {
+ "R16G16B16_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_SSCALED (0x19e)
+ {
+ "R16G16B16_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_USCALED (0x19f)
+ {
+ "R16G16B16_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x1a0 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // BC6H_SF16 (0x1a1)
+ {
+ "BC6H_SF16",
+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC7_UNORM (0x1a2)
+ {
+ "BC7_UNORM",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC7_UNORM_SRGB (0x1a3)
+ {
+ "BC7_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ true, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // BC6H_UF16 (0x1a4)
+ {
+ "BC6H_UF16",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 0, 0, 0 }, // Swizzle
+ { 8, 0, 0, 0 }, // Bits per component
+ 128, // Bits per element
+ 16, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ true, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+ 4, // bcWidth
+ 4, // bcHeight
+ false, // isLuminance
+ },
+ // 0x1a5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1a6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1a7 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8G8B8_UNORM_SRGB (0x1a8)
+ {
+ "R8G8B8_UNORM_SRGB",
+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x1a9 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1aa (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1ab (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1ac (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1ad (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1ae (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1af (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R16G16B16_UINT (0x1b0)
+ {
+ "R16G16B16_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R16G16B16_SINT (0x1b1)
+ {
+ "R16G16B16_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 16, 16, 16, 0 }, // Bits per component
+ 48, // Bits per element
+ 6, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x1b2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R10G10B10A2_SNORM (0x1b3)
+ {
+ "R10G10B10A2_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_USCALED (0x1b4)
+ {
+ "R10G10B10A2_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_SSCALED (0x1b5)
+ {
+ "R10G10B10A2_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R10G10B10A2_SINT (0x1b6)
+ {
+ "R10G10B10A2_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_SNORM (0x1b7)
+ {
+ "B10G10R10A2_SNORM",
+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { true, true, true, true }, // Is normalized?
+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_USCALED (0x1b8)
+ {
+ "B10G10R10A2_USCALED",
+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_SSCALED (0x1b9)
+ {
+ "B10G10R10A2_SSCALED",
+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_UINT (0x1ba)
+ {
+ "B10G10R10A2_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // B10G10R10A2_SINT (0x1bb)
+ {
+ "B10G10R10A2_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 2, 1, 0, 3 }, // Swizzle
+ { 10, 10, 10, 2 }, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // 0x1bc (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1bd (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1be (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1bf (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c0 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c1 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c2 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c3 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c4 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c5 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c6 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // 0x1c7 (Padding)
+ {
+ "UNKNOWN",
+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+ { false, false, false, false },
+ { 0.0f, 0.0f, 0.0f, 0.0f },
+ 1, 1, false },
+ // R8G8B8_UINT (0x1c8)
+ {
+ "R8G8B8_UINT",
+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+ // R8G8B8_SINT (0x1c9)
+ {
+ "R8G8B8_SINT",
+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
+ { 0, 0, 0, 0x1 }, // Defaults for missing components
+ { 0, 1, 2, 0 }, // Swizzle
+ { 8, 8, 8, 0 }, // Bits per component
+ 24, // Bits per element
+ 3, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ { false, false, false, false }, // Is normalized?
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
+ false, // isLuminance
+ },
+};
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h
new file mode 100644
index 00000000000..b9dd53ebaa4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.h
@@ -0,0 +1,251 @@
+
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file formats.h
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+#pragma once
+
+#include "common/os.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TYPE - Format component type
+//////////////////////////////////////////////////////////////////////////
+// Data type of a single format component. SWR_FORMAT_INFO::type holds one
+// SWR_TYPE for each of a format's four component slots.
+enum SWR_TYPE
+{
+ // First enumerator (value 0). The format table uses it for component slots
+ // a format does not have and for every slot of its "UNKNOWN" padding entries.
+ SWR_TYPE_UNKNOWN,
+ SWR_TYPE_UNUSED,
+ SWR_TYPE_UNORM,
+ SWR_TYPE_SNORM,
+ SWR_TYPE_UINT,
+ SWR_TYPE_SINT,
+ SWR_TYPE_FLOAT,
+ SWR_TYPE_SSCALED,
+ SWR_TYPE_USCALED,
+};
+//////////////////////////////////////////////////////////////////////////
+/// SWR_FORMAT
+//////////////////////////////////////////////////////////////////////////
+enum SWR_FORMAT
+{
+ // The values are sparse: not every number below NUM_SWR_FORMATS names a
+ // format. The format info table in formats.cpp carries "UNKNOWN" padding
+ // entries for unused values (for example 0x1c6 and 0x1c7, which sit between
+ // B10G10R10A2_SINT and R8G8B8_UINT).
+ R32G32B32A32_FLOAT = 0x0,
+ R32G32B32A32_SINT = 0x1,
+ R32G32B32A32_UINT = 0x2,
+ R32G32B32X32_FLOAT = 0x6,
+ R32G32B32A32_SSCALED = 0x7,
+ R32G32B32A32_USCALED = 0x8,
+ R32G32B32_FLOAT = 0x40,
+ R32G32B32_SINT = 0x41,
+ R32G32B32_UINT = 0x42,
+ R32G32B32_SSCALED = 0x45,
+ R32G32B32_USCALED = 0x46,
+ R16G16B16A16_UNORM = 0x80,
+ R16G16B16A16_SNORM = 0x81,
+ R16G16B16A16_SINT = 0x82,
+ R16G16B16A16_UINT = 0x83,
+ R16G16B16A16_FLOAT = 0x84,
+ R32G32_FLOAT = 0x85,
+ R32G32_SINT = 0x86,
+ R32G32_UINT = 0x87,
+ R32_FLOAT_X8X24_TYPELESS = 0x88,
+ X32_TYPELESS_G8X24_UINT = 0x89,
+ L32A32_FLOAT = 0x8A,
+ R16G16B16X16_UNORM = 0x8E,
+ R16G16B16X16_FLOAT = 0x8F,
+ L32X32_FLOAT = 0x91,
+ I32X32_FLOAT = 0x92,
+ R16G16B16A16_SSCALED = 0x93,
+ R16G16B16A16_USCALED = 0x94,
+ R32G32_SSCALED = 0x95,
+ R32G32_USCALED = 0x96,
+ R32_FLOAT_X8X24_TYPELESS_LD = 0x98,
+ B8G8R8A8_UNORM = 0xC0,
+ B8G8R8A8_UNORM_SRGB = 0xC1,
+ R10G10B10A2_UNORM = 0xC2,
+ R10G10B10A2_UNORM_SRGB = 0xC3,
+ R10G10B10A2_UINT = 0xC4,
+ R8G8B8A8_UNORM = 0xC7,
+ R8G8B8A8_UNORM_SRGB = 0xC8,
+ R8G8B8A8_SNORM = 0xC9,
+ R8G8B8A8_SINT = 0xCA,
+ R8G8B8A8_UINT = 0xCB,
+ R16G16_UNORM = 0xCC,
+ R16G16_SNORM = 0xCD,
+ R16G16_SINT = 0xCE,
+ R16G16_UINT = 0xCF,
+ R16G16_FLOAT = 0xD0,
+ B10G10R10A2_UNORM = 0xD1,
+ B10G10R10A2_UNORM_SRGB = 0xD2,
+ R11G11B10_FLOAT = 0xD3,
+ R32_SINT = 0xD6,
+ R32_UINT = 0xD7,
+ R32_FLOAT = 0xD8,
+ R24_UNORM_X8_TYPELESS = 0xD9,
+ R24_UNORM_X8_TYPELESS_LD = 0xDC,
+ L32_UNORM = 0xDD,
+ L16A16_UNORM = 0xDF,
+ I24X8_UNORM = 0xE0,
+ L24X8_UNORM = 0xE1,
+ I32_FLOAT = 0xE3,
+ L32_FLOAT = 0xE4,
+ A32_FLOAT = 0xE5,
+ B8G8R8X8_UNORM = 0xE9,
+ B8G8R8X8_UNORM_SRGB = 0xEA,
+ R8G8B8X8_UNORM = 0xEB,
+ R8G8B8X8_UNORM_SRGB = 0xEC,
+ R9G9B9E5_SHAREDEXP = 0xED,
+ B10G10R10X2_UNORM = 0xEE,
+ L16A16_FLOAT = 0xF0,
+ R10G10B10X2_USCALED = 0xF3,
+ R8G8B8A8_SSCALED = 0xF4,
+ R8G8B8A8_USCALED = 0xF5,
+ R16G16_SSCALED = 0xF6,
+ R16G16_USCALED = 0xF7,
+ R32_SSCALED = 0xF8,
+ R32_USCALED = 0xF9,
+ B5G6R5_UNORM = 0x100,
+ B5G6R5_UNORM_SRGB = 0x101,
+ B5G5R5A1_UNORM = 0x102,
+ B5G5R5A1_UNORM_SRGB = 0x103,
+ B4G4R4A4_UNORM = 0x104,
+ B4G4R4A4_UNORM_SRGB = 0x105,
+ R8G8_UNORM = 0x106,
+ R8G8_SNORM = 0x107,
+ R8G8_SINT = 0x108,
+ R8G8_UINT = 0x109,
+ R16_UNORM = 0x10A,
+ R16_SNORM = 0x10B,
+ R16_SINT = 0x10C,
+ R16_UINT = 0x10D,
+ R16_FLOAT = 0x10E,
+ I16_UNORM = 0x111,
+ L16_UNORM = 0x112,
+ A16_UNORM = 0x113,
+ L8A8_UNORM = 0x114,
+ I16_FLOAT = 0x115,
+ L16_FLOAT = 0x116,
+ A16_FLOAT = 0x117,
+ L8A8_UNORM_SRGB = 0x118,
+ B5G5R5X1_UNORM = 0x11A,
+ B5G5R5X1_UNORM_SRGB = 0x11B,
+ R8G8_SSCALED = 0x11C,
+ R8G8_USCALED = 0x11D,
+ R16_SSCALED = 0x11E,
+ R16_USCALED = 0x11F,
+ L8A8_UINT = 0x126,
+ L8A8_SINT = 0x127,
+ R8_UNORM = 0x140,
+ R8_SNORM = 0x141,
+ R8_SINT = 0x142,
+ R8_UINT = 0x143,
+ A8_UNORM = 0x144,
+ I8_UNORM = 0x145,
+ L8_UNORM = 0x146,
+ R8_SSCALED = 0x149,
+ R8_USCALED = 0x14A,
+ L8_UNORM_SRGB = 0x14C,
+ L8_UINT = 0x152,
+ L8_SINT = 0x153,
+ I8_UINT = 0x154,
+ I8_SINT = 0x155,
+ YCRCB_SWAPUVY = 0x183,
+ BC1_UNORM = 0x186,
+ BC2_UNORM = 0x187,
+ BC3_UNORM = 0x188,
+ BC4_UNORM = 0x189,
+ BC5_UNORM = 0x18A,
+ BC1_UNORM_SRGB = 0x18B,
+ BC2_UNORM_SRGB = 0x18C,
+ BC3_UNORM_SRGB = 0x18D,
+ YCRCB_SWAPUV = 0x18F,
+ R8G8B8_UNORM = 0x193,
+ R8G8B8_SNORM = 0x194,
+ R8G8B8_SSCALED = 0x195,
+ R8G8B8_USCALED = 0x196,
+ BC4_SNORM = 0x199,
+ BC5_SNORM = 0x19A,
+ R16G16B16_FLOAT = 0x19B,
+ R16G16B16_UNORM = 0x19C,
+ R16G16B16_SNORM = 0x19D,
+ R16G16B16_SSCALED = 0x19E,
+ R16G16B16_USCALED = 0x19F,
+ BC6H_SF16 = 0x1A1,
+ BC7_UNORM = 0x1A2,
+ BC7_UNORM_SRGB = 0x1A3,
+ BC6H_UF16 = 0x1A4,
+ R8G8B8_UNORM_SRGB = 0x1A8,
+ R16G16B16_UINT = 0x1B0,
+ R16G16B16_SINT = 0x1B1,
+ R10G10B10A2_SNORM = 0x1B3,
+ R10G10B10A2_USCALED = 0x1B4,
+ R10G10B10A2_SSCALED = 0x1B5,
+ R10G10B10A2_SINT = 0x1B6,
+ B10G10R10A2_SNORM = 0x1B7,
+ B10G10R10A2_USCALED = 0x1B8,
+ B10G10R10A2_SSCALED = 0x1B9,
+ B10G10R10A2_UINT = 0x1BA,
+ B10G10R10A2_SINT = 0x1BB,
+ R8G8B8_UINT = 0x1C8,
+ R8G8B8_SINT = 0x1C9,
+ // One past the highest format value above (R8G8B8_SINT = 0x1C9); not a format.
+ NUM_SWR_FORMATS = 0x1CA,
+};
+//////////////////////////////////////////////////////////////////////////
+/// SWR_FORMAT_INFO - Format information
+//////////////////////////////////////////////////////////////////////////
+// One row of the format table (gFormatInfo). Array members have one slot per
+// component; the meanings below follow the per-field comments used by the
+// table entries in formats.cpp.
+struct SWR_FORMAT_INFO
+{
+ const char* name; ///< format name string, e.g. "R8G8B8_UINT"; "UNKNOWN" for padding entries
+ SWR_TYPE type[4]; ///< component type per component
+ uint32_t defaults[4]; ///< defaults for missing components
+ uint32_t swizzle[4]; ///< swizzle per component
+ uint32_t bpc[4]; ///< bits per component
+ uint32_t bpp; ///< bits per pixel
+ uint32_t Bpp; ///< bytes per pixel
+ uint32_t numComps; ///< number of components
+ bool isSRGB;
+ bool isBC;
+ bool isSubsampled;
+ bool isNormalized[4]; ///< is normalized? (per component)
+ float toFloat[4]; ///< to-float scale factor per component
+ uint32_t bcWidth; ///< 1 in the entries visible here; presumably the block width of BC formats - TODO confirm
+ uint32_t bcHeight; ///< 1 in the entries visible here; presumably the block height of BC formats - TODO confirm
+ bool isLuminance;
+};
+
+extern const SWR_FORMAT_INFO gFormatInfo[];
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves format info struct for given format.
+/// @param format - SWR format
+INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
+{
+    // The format value is used directly as the table index, so anything at or
+    // beyond NUM_SWR_FORMATS would read past the last table entry. The unsigned
+    // comparison also rejects negative values.
+    SWR_ASSERT((uint32_t)format < (uint32_t)NUM_SWR_FORMATS, "Invalid SWR_FORMAT");
+    return gFormatInfo[format];
+}
+
+// lookup table for unorm8 srgb -> float conversion
+extern const uint32_t srgb8Table[256];
diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp
new file mode 100644
index 00000000000..ef381799bc3
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp
@@ -0,0 +1,235 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <bitset>
+#include <array>
+#include <string>
+#include <algorithm>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#else
+#include <string.h>
+#include <cpuid.h>
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Snapshot of the CPU's CPUID feature flags, taken once when the
+///        object is constructed. Every getter only reads that snapshot, so
+///        the getters are const and usable on a const InstructionSet.
+class InstructionSet
+{
+public:
+    InstructionSet() : CPU_Rep() {}
+
+    // getters
+    std::string Vendor(void) const { return CPU_Rep.vendor_; }
+    std::string Brand(void) const { return CPU_Rep.brand_; }
+
+    // function 0x00000001, ECX
+    bool SSE3(void) const { return CPU_Rep.f_1_ECX_[0]; }
+    bool PCLMULQDQ(void) const { return CPU_Rep.f_1_ECX_[1]; }
+    bool MONITOR(void) const { return CPU_Rep.f_1_ECX_[3]; }
+    bool SSSE3(void) const { return CPU_Rep.f_1_ECX_[9]; }
+    bool FMA(void) const { return CPU_Rep.f_1_ECX_[12]; }
+    bool CMPXCHG16B(void) const { return CPU_Rep.f_1_ECX_[13]; }
+    bool SSE41(void) const { return CPU_Rep.f_1_ECX_[19]; }
+    bool SSE42(void) const { return CPU_Rep.f_1_ECX_[20]; }
+    bool MOVBE(void) const { return CPU_Rep.f_1_ECX_[22]; }
+    bool POPCNT(void) const { return CPU_Rep.f_1_ECX_[23]; }
+    bool AES(void) const { return CPU_Rep.f_1_ECX_[25]; }
+    bool XSAVE(void) const { return CPU_Rep.f_1_ECX_[26]; }
+    bool OSXSAVE(void) const { return CPU_Rep.f_1_ECX_[27]; }
+    bool RDRAND(void) const { return CPU_Rep.f_1_ECX_[30]; }
+
+    // function 0x00000001, EDX
+    bool MSR(void) const { return CPU_Rep.f_1_EDX_[5]; }
+    bool CX8(void) const { return CPU_Rep.f_1_EDX_[8]; }
+    bool SEP(void) const { return CPU_Rep.f_1_EDX_[11]; }
+    bool CMOV(void) const { return CPU_Rep.f_1_EDX_[15]; }
+    bool CLFSH(void) const { return CPU_Rep.f_1_EDX_[19]; }
+    bool MMX(void) const { return CPU_Rep.f_1_EDX_[23]; }
+    bool FXSR(void) const { return CPU_Rep.f_1_EDX_[24]; }
+    bool SSE(void) const { return CPU_Rep.f_1_EDX_[25]; }
+    bool SSE2(void) const { return CPU_Rep.f_1_EDX_[26]; }
+
+    // function 0x00000007, EBX
+    bool FSGSBASE(void) const { return CPU_Rep.f_7_EBX_[0]; }
+    bool BMI1(void) const { return CPU_Rep.f_7_EBX_[3]; }
+    bool HLE(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
+    bool BMI2(void) const { return CPU_Rep.f_7_EBX_[8]; }
+    bool ERMS(void) const { return CPU_Rep.f_7_EBX_[9]; }
+    bool INVPCID(void) const { return CPU_Rep.f_7_EBX_[10]; }
+    bool RTM(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
+    bool RDSEED(void) const { return CPU_Rep.f_7_EBX_[18]; }
+    bool ADX(void) const { return CPU_Rep.f_7_EBX_[19]; }
+    bool SHA(void) const { return CPU_Rep.f_7_EBX_[29]; }
+
+    // function 0x00000007, ECX
+    bool PREFETCHWT1(void) const { return CPU_Rep.f_7_ECX_[0]; }
+
+    // function 0x80000001, ECX
+    // NOTE(review): LZCNT, SYSCALL and RDTSCP are reported only when the vendor
+    // string is "GenuineIntel" - confirm that is intended for other vendors.
+    bool LAHF(void) const { return CPU_Rep.f_81_ECX_[0]; }
+    bool LZCNT(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
+    bool ABM(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
+    bool SSE4a(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
+    bool XOP(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
+    bool TBM(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }
+
+    // function 0x80000001, EDX
+    bool SYSCALL(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
+    bool MMXEXT(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
+    bool RDTSCP(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
+    bool _3DNOWEXT(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
+    bool _3DNOW(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }
+
+    // These report the raw CPUID bits only.
+    // NOTE(review): no OS-support (XGETBV) check is made here - confirm callers
+    // do not need one before using AVX state.
+    bool AVX(void) const { return CPU_Rep.f_1_ECX_[28]; }
+    bool F16C(void) const { return CPU_Rep.f_1_ECX_[29]; }
+    bool AVX2(void) const { return CPU_Rep.f_7_EBX_[5]; }
+    bool AVX512F(void) const { return CPU_Rep.f_7_EBX_[16]; }
+    bool AVX512PF(void) const { return CPU_Rep.f_7_EBX_[26]; }
+    bool AVX512ER(void) const { return CPU_Rep.f_7_EBX_[27]; }
+    bool AVX512CD(void) const { return CPU_Rep.f_7_EBX_[28]; }
+
+private:
+    // Queries CPUID in its constructor and keeps the raw register values plus
+    // the decoded vendor/brand strings and feature bitsets.
+    class InstructionSet_Internal
+    {
+    public:
+        InstructionSet_Internal()
+            : nIds_{ 0 },
+            nExIds_{ 0 },
+            isIntel_{ false },
+            isAMD_{ false },
+            f_1_ECX_{ 0 },
+            f_1_EDX_{ 0 },
+            f_7_EBX_{ 0 },
+            f_7_ECX_{ 0 },
+            f_81_ECX_{ 0 },
+            f_81_EDX_{ 0 },
+            data_{},
+            extdata_{}
+        {
+            // registers of one CPUID call, in EAX, EBX, ECX, EDX order
+            std::array<int, 4> cpui{};
+
+            // Calling __cpuid with 0x0 as the function_id argument
+            // gets the number of the highest valid function ID.
+#if defined(_WIN32)
+            __cpuid(cpui.data(), 0);
+            nIds_ = cpui[0];
+#else
+            nIds_ = __get_cpuid_max(0, NULL);
+#endif
+
+            for (int i = 0; i <= nIds_; ++i)
+            {
+#if defined(_WIN32)
+                __cpuidex(cpui.data(), i, 0);
+#else
+                int *data = cpui.data();
+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
+#endif
+                data_.push_back(cpui);
+            }
+
+            // Capture vendor string. The 12 characters live in EBX, EDX, ECX of
+            // function 0, in that order. memcpy is used instead of storing
+            // through a casted int* so the char buffer is never accessed through
+            // an incompatible pointer type.
+            char vendor[0x20];
+            memset(vendor, 0, sizeof(vendor));
+            memcpy(vendor, &data_[0][1], sizeof(int));
+            memcpy(vendor + 4, &data_[0][3], sizeof(int));
+            memcpy(vendor + 8, &data_[0][2], sizeof(int));
+            vendor_ = vendor;
+            if (vendor_ == "GenuineIntel")
+            {
+                isIntel_ = true;
+            }
+            else if (vendor_ == "AuthenticAMD")
+            {
+                isAMD_ = true;
+            }
+
+            // load bitset with flags for function 0x00000001
+            if (nIds_ >= 1)
+            {
+                f_1_ECX_ = data_[1][2];
+                f_1_EDX_ = data_[1][3];
+            }
+
+            // load bitset with flags for function 0x00000007
+            if (nIds_ >= 7)
+            {
+                f_7_EBX_ = data_[7][1];
+                f_7_ECX_ = data_[7][2];
+            }
+
+            // Calling __cpuid with 0x80000000 as the function_id argument
+            // gets the number of the highest valid extended ID.
+#if defined(_WIN32)
+            __cpuid(cpui.data(), 0x80000000);
+            nExIds_ = cpui[0];
+#else
+            nExIds_ = __get_cpuid_max(0x80000000, NULL);
+#endif
+
+            char brand[0x40];
+            memset(brand, 0, sizeof(brand));
+
+            // extdata_[k] holds the registers of function 0x80000000 + k
+            for (unsigned i = 0x80000000; i <= nExIds_; ++i)
+            {
+#if defined(_WIN32)
+                __cpuidex(cpui.data(), i, 0);
+#else
+                int *data = cpui.data();
+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
+#endif
+                extdata_.push_back(cpui);
+            }
+
+            // load bitset with flags for function 0x80000001
+            if (nExIds_ >= 0x80000001)
+            {
+                f_81_ECX_ = extdata_[1][2];
+                f_81_EDX_ = extdata_[1][3];
+            }
+
+            // Interpret CPU brand string if reported: functions 0x80000002..4
+            // each contribute 16 bytes; the buffer was zeroed above, so the
+            // result is always terminated.
+            if (nExIds_ >= 0x80000004)
+            {
+                memcpy(brand, extdata_[2].data(), sizeof(cpui));
+                memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
+                memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
+                brand_ = brand;
+            }
+        }
+
+        int nIds_;                  // highest standard function id
+        unsigned nExIds_;           // highest extended function id
+        std::string vendor_;
+        std::string brand_;
+        bool isIntel_;
+        bool isAMD_;
+        std::bitset<32> f_1_ECX_;
+        std::bitset<32> f_1_EDX_;
+        std::bitset<32> f_7_EBX_;
+        std::bitset<32> f_7_ECX_;
+        std::bitset<32> f_81_ECX_;
+        std::bitset<32> f_81_EDX_;
+        std::vector<std::array<int, 4>> data_;
+        std::vector<std::array<int, 4>> extdata_;
+    };
+    const InstructionSet_Internal CPU_Rep;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
new file mode 100644
index 00000000000..736d29856a6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -0,0 +1,221 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#ifndef __SWR_OS_H__
+#define __SWR_OS_H__
+
+#include "core/knobs.h"
+
+#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
+
+#define SWR_API __cdecl
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include "Windows.h"
+#include <intrin.h>
+#include <cstdint>
+
+#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
+#define THREAD __declspec(thread)
+#define INLINE __forceinline
+#define DEBUGBREAK __debugbreak()
+
+#define PRAGMA_WARNING_PUSH_DISABLE(...) \
+ __pragma(warning(push));\
+ __pragma(warning(disable:__VA_ARGS__));
+
+#define PRAGMA_WARNING_POP() __pragma(warning(pop))
+
+#if defined(_WIN32)
+#if defined(_WIN64)
+#define BitScanForwardSizeT BitScanForward64
+#define _mm_popcount_sizeT _mm_popcnt_u64
+#else
+#define BitScanForwardSizeT BitScanForward
+#define _mm_popcount_sizeT _mm_popcnt_u32
+#endif
+#endif
+
+#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+
+#define SWR_API
+
+#include <stdlib.h>
+#include <string.h>
+#include <X11/Xmd.h>
+#include <x86intrin.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+// Windows-style type names for the non-Windows build, so code written against
+// the Windows spellings compiles here as well.
+typedef void VOID;
+typedef void* LPVOID;
+// BOOL and BYTE are built on CARD8 (presumably supplied by <X11/Xmd.h>,
+// included above), so BOOL is a CARD8-sized type in this branch, not an int.
+typedef CARD8 BOOL;
+typedef wchar_t WCHAR;
+typedef uint16_t UINT16;
+typedef int INT;
+typedef int INT32;
+typedef unsigned int UINT;
+typedef uint32_t UINT32;
+typedef uint64_t UINT64;
+typedef int64_t INT64;
+typedef void* HANDLE;
+typedef float FLOAT;
+// LONG is deliberately `int`, not `long`.
+typedef int LONG;
+typedef CARD8 BYTE;
+typedef unsigned char UCHAR;
+typedef unsigned int DWORD;
+
+#undef FALSE
+#define FALSE 0
+
+#undef TRUE
+#define TRUE 1
+
+#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
+#define THREAD __thread
+#ifndef INLINE
+#define INLINE __inline
+#endif
+#define DEBUGBREAK asm ("int $3")
+#define __cdecl
+#define __declspec(X)
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
+
+#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
+// Fallback time-stamp-counter read for old gcc versions (see the guard above).
+// @return the 64-bit counter: EDX in the upper half, EAX in the lower half.
+inline
+uint64_t __rdtsc()
+{
+    // Both halves are unsigned 32-bit values. With a signed `long` the low half
+    // is sign-extended when widened on targets where long is 32 bits, which
+    // sets the whole upper half whenever bit 31 of EAX is set.
+    uint32_t low, high;
+    asm volatile("rdtsc" : "=a"(low), "=d"(high));
+    return ((uint64_t)low | ((uint64_t)high << 32));
+}
+#endif
+
+#ifndef __clang__
+// Intrinsic not defined in gcc
+// Stores the two 128-bit halves of `a` to two independent, possibly unaligned
+// locations: the low half to `lo`, the high half to `hi`.
+static INLINE
+void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a)
+{
+    const __m128i lowerHalf = _mm256_castsi256_si128(a);
+    const __m128i upperHalf = _mm256_extractf128_si256(a, 0x1);
+
+    // low half is written first, then the high half
+    _mm_storeu_si128(lo, lowerHalf);
+    _mm_storeu_si128(hi, upperHalf);
+}
+#endif
+
+// Finds the lowest set bit of Mask.
+// @param Index - receives the bit position of the lowest set bit; left
+//                untouched when Mask is zero
+// @param Mask  - value to scan
+// @return nonzero if a set bit was found, 0 if Mask is zero
+inline
+unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask)
+{
+    // the result of the ctz builtins is undefined for zero, so never pass it
+    if (Mask == 0)
+    {
+        return 0;
+    }
+    // the `l` variant scans the full unsigned long; plain __builtin_ctz takes
+    // an unsigned int and would drop the upper bits where long is 64 bits
+    *Index = __builtin_ctzl(Mask);
+    return 1;
+}
+
+// Finds the lowest set bit of Mask.
+// @param Index - receives the bit position of the lowest set bit; left
+//                untouched when Mask is zero
+// @param Mask  - value to scan
+// @return nonzero if a set bit was found, 0 if Mask is zero
+inline
+unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask)
+{
+    // the result of __builtin_ctz is undefined for zero, so never pass it
+    if (Mask == 0)
+    {
+        return 0;
+    }
+    *Index = __builtin_ctz(Mask);
+    return 1;
+}
+
+// Finds the highest set bit of Mask.
+// @param Index - receives the bit position (0 = least significant bit) of the
+//                highest set bit; left untouched when Mask is zero
+// @param Mask  - value to scan
+// @return nonzero if a set bit was found, 0 if Mask is zero
+inline
+unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask)
+{
+    // the result of the clz builtins is undefined for zero, so never pass it
+    if (Mask == 0)
+    {
+        return 0;
+    }
+    // clz counts zeros from the top; the bit position is (width - 1) - clz.
+    // The `l` variant matches the width of unsigned long.
+    *Index = (unsigned long)(sizeof(Mask) * 8 - 1) - (unsigned long)__builtin_clzl(Mask);
+    return 1;
+}
+
+// Finds the highest set bit of Mask.
+// @param Index - receives the bit position (0 = least significant bit) of the
+//                highest set bit; left untouched when Mask is zero
+// @param Mask  - value to scan
+// @return nonzero if a set bit was found, 0 if Mask is zero
+inline
+unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask)
+{
+    // the result of __builtin_clz is undefined for zero, so never pass it
+    if (Mask == 0)
+    {
+        return 0;
+    }
+    // clz counts zeros from the top; the bit position is (width - 1) - clz
+    *Index = (unsigned int)(sizeof(Mask) * 8 - 1) - (unsigned int)__builtin_clz(Mask);
+    return 1;
+}
+
+// Allocates `size` bytes whose address is a multiple of `alignment`.
+// Release the memory with _aligned_free.
+// @param size      - number of bytes to allocate
+// @param alignment - requested alignment in bytes (a power of two)
+// @return pointer to the allocation, or NULL on failure
+inline
+void *_aligned_malloc(unsigned int size, unsigned int alignment)
+{
+    // posix_memalign requires the alignment to be a multiple of sizeof(void*).
+    // Raising smaller power-of-two requests to sizeof(void*) still satisfies
+    // them and keeps such requests from failing.
+    size_t effectiveAlignment = alignment;
+    if (effectiveAlignment < sizeof(void*))
+    {
+        effectiveAlignment = sizeof(void*);
+    }
+
+    void *ret = NULL;
+    if (posix_memalign(&ret, effectiveAlignment, size))
+    {
+        return NULL;
+    }
+    return ret;
+}
+
+// Tests one bit of the value `a` points to.
+// @param a - address of the value to examine (not modified)
+// @param b - bit position to test; handled here for 0..31
+// @return 1 if the bit is set, 0 otherwise
+inline
+unsigned char _bittest(const LONG *a, LONG b)
+{
+    // Shift an unsigned copy of the value instead of building `1 << b`:
+    // shifting a signed 1 into bit 31 overflows int. Reading through the
+    // const pointer also avoids casting const away.
+    const unsigned int bits = (unsigned int)(*a);
+    return (unsigned char)((bits >> b) & 1u);
+}
+
+#define GetCurrentProcessId getpid
+
+#define CreateDirectory(name, pSecurity) mkdir(name, 0777)
+
+#if defined(_WIN32)
+// Counts the bits set in v using the compiler builtin.
+static inline
+unsigned int _mm_popcnt_u32(unsigned int v)
+{
+    const int bitsSet = __builtin_popcount(v);
+    return bitsSet;
+}
+#endif
+
+// Memory from _aligned_malloc (posix_memalign) is released with plain free().
+#define _aligned_free free
+// Windows-style interlocked operations mapped onto the __sync builtins.
+// Note the argument order: the Windows form is (Dest, Exchange, Comparand),
+// while the builtin takes (ptr, expected-old, new), so the last two are swapped.
+#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
+// Expands to the fetch-and-add builtin.
+#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
+// Decrement/increment expand to the op-and-fetch builtins with a step of 1.
+#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
+// Empty asm statement with a "memory" clobber.
+#define _ReadWriteBarrier() asm volatile("" ::: "memory")
+// Calling-convention keyword expands to nothing in this branch.
+#define __stdcall
+
+#define PRAGMA_WARNING_PUSH_DISABLE(...)
+#define PRAGMA_WARNING_POP()
+
+#else
+
+#error Unsupported OS/system.
+
+#endif
+
+// Universal types
+typedef BYTE KILOBYTE[1024];
+typedef KILOBYTE MEGABYTE[1024];
+typedef MEGABYTE GIGABYTE[1024];
+
+#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
+#if KNOB_SIMD_WIDTH == 8
+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
+#endif
+
+#include "common/swr_assert.h"
+
+#endif//__SWR_OS_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
new file mode 100644
index 00000000000..454641b2751
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -0,0 +1,188 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rdtsc_buckets.cpp
+*
+* @brief implementation of rdtsc buckets.
+*
+* Notes:
+*
+******************************************************************************/
+#include "rdtsc_buckets.h"
+#include <inttypes.h>
+
+THREAD UINT tlsThreadId = 0;
+
+// Registers the calling thread with this manager: assigns it the next thread
+// id (also stored in the thread-local tlsThreadId), opens its threadviz file
+// when threadviz is enabled, and appends its record to mThreads.
+// @param name - name of thread, used for labels in reports and threadviz
+void BucketManager::RegisterThread(const std::string& name)
+{
+    BUCKET_THREAD newThread;
+    newThread.name = name;
+    newThread.root.children.reserve(mBuckets.size());
+    newThread.root.id = 0;
+    newThread.root.pParent = nullptr;
+    newThread.pCurrent = &newThread.root;
+
+    // Scoped lock: released on every exit path, including an exception thrown
+    // by the stream or by push_back.
+    std::lock_guard<std::mutex> lock(mThreadMutex);
+
+    // assign unique thread id for this thread
+    size_t id = mThreads.size();
+    newThread.id = (UINT)id;
+    tlsThreadId = (UINT)id;
+
+    // open threadviz file if enabled
+    if (mThreadViz)
+    {
+        // '/' is accepted as a path separator by fopen on both Windows and
+        // POSIX; a backslash is an ordinary filename character on POSIX, which
+        // would put the file next to the threadviz directory instead of in it.
+        std::stringstream ss;
+        ss << mThreadVizDir << "/threadviz_thread." << newThread.id << ".dat";
+        newThread.vizFile = fopen(ss.str().c_str(), "wb");
+    }
+
+    // store new thread
+    mThreads.push_back(newThread);
+
+    // The local's pCurrent refers to the local's own root, which is destroyed
+    // when this function returns. Point the stored copy at its own root. This
+    // is harmless if BUCKET_THREAD's copy already does so (its definition is
+    // not visible here).
+    // NOTE(review): push_back may reallocate mThreads; confirm previously
+    // registered threads' pCurrent pointers stay valid in that case.
+    mThreads.back().pCurrent = &mThreads.back().root;
+}
+
+// Appends a bucket description to mBuckets.
+// @param desc - description of the bucket
+// @return the index the description was stored at, which is the bucket's id
+UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
+{
+    // the new entry lands at the current end of the list
+    const UINT newId = (UINT)mBuckets.size();
+    mBuckets.push_back(desc);
+    return newId;
+}
+
+// Prints one report line for `bucket`, then recurses into every child that
+// has a nonzero count.
+// @param f            - output file
+// @param level        - nesting depth, selects the hierarchy prefix
+// @param threadCycles - total cycles of the owning thread (for %Tot)
+// @param parentCycles - cycles of the parent bucket (for %Par)
+// @param bucket       - bucket to print
+void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
+{
+    const char *arrows[] = {
+        "",
+        "|-> ",
+        " |-> ",
+        " |-> ",
+        " |-> ",
+        " |-> ",
+        " |-> "
+    };
+
+    // nesting deeper than the table reuses the last prefix instead of reading
+    // past the end of the array
+    const UINT deepestArrow = (UINT)(sizeof(arrows) / sizeof(arrows[0])) - 1;
+    const UINT arrowIndex = (level < deepestArrow) ? level : deepestArrow;
+
+    // compute percent of total cycles used by this bucket
+    float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
+
+    // compute percent of parent cycles used by this bucket
+    float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
+
+    // compute average cycle count per invocation; callers only pass buckets
+    // with a nonzero count, the guard keeps a stray zero from dividing
+    UINT64 CPE = bucket.count ? (bucket.elapsed / bucket.count) : 0;
+
+    const BUCKET_DESC &desc = mBuckets[bucket.id];
+
+    // construct hierarchy visualization; std::string grows as needed, so a
+    // long bucket name cannot overrun a fixed-size buffer
+    std::string hier(arrows[arrowIndex]);
+    hier += desc.name;
+
+    // print out
+    fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
+        percentTotal,
+        percentParent,
+        bucket.elapsed,
+        CPE,
+        bucket.count,
+        (unsigned long)0,
+        (uint32_t)0,
+        hier.c_str()
+    );
+
+    // dump all children of this bucket
+    for (const BUCKET& child : bucket.children)
+    {
+        if (child.count)
+        {
+            PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
+        }
+    }
+}
+
+// Prints the report section of one thread: a two-line header followed by
+// every top-level bucket (and, through PrintBucket, its children) that has a
+// nonzero count.
+// @param f      - output file
+// @param thread - thread record to report
+void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
+{
+    // header
+    fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
+    fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n");
+
+    const std::vector<BUCKET>& topLevel = thread.root.children;
+
+    // the thread total is the sum of the elapsed cycles of the root's children
+    UINT64 cycleSum = 0;
+    for (size_t i = 0; i < topLevel.size(); ++i)
+    {
+        cycleSum += topLevel[i].elapsed;
+    }
+
+    // top-level buckets use the thread total as their parent total too
+    for (size_t i = 0; i < topLevel.size(); ++i)
+    {
+        const BUCKET& entry = topLevel[i];
+        if (!entry.count)
+        {
+            continue;
+        }
+        PrintBucket(f, 0, cycleSum, cycleSum, entry);
+    }
+}
+
+// Closes every thread's threadviz file and writes the bucket descriptions to
+// threadviz_buckets.dat inside the threadviz directory.
+void BucketManager::DumpThreadViz()
+{
+    // ensure all thread data is flushed
+    {
+        std::lock_guard<std::mutex> lock(mThreadMutex);
+        for (auto& thread : mThreads)
+        {
+            // vizFile is null when fopen failed at registration, or when this
+            // function already ran; fclose on a null stream is not allowed
+            if (thread.vizFile != nullptr)
+            {
+                // fclose flushes buffered data before closing
+                fclose(thread.vizFile);
+                // drop the stale handle so it cannot be closed or used again
+                thread.vizFile = nullptr;
+            }
+        }
+    }
+
+    // dump bucket descriptions
+    // '/' is accepted as a path separator by fopen on both Windows and POSIX;
+    // a backslash is an ordinary filename character on POSIX.
+    std::stringstream ss;
+    ss << mThreadVizDir << "/threadviz_buckets.dat";
+
+    FILE* f = fopen(ss.str().c_str(), "wb");
+    if (f == nullptr)
+    {
+        // nothing can be written without the file
+        return;
+    }
+    for (auto& bucket : mBuckets)
+    {
+        Serialize(f, bucket);
+    }
+    fclose(f);
+}
+
+// Emits the collected data. With threadviz enabled the threadviz files are
+// finalized instead (filename is not used); otherwise a text report covering
+// every registered thread is written to `filename`.
+// @param filename - path of the text report
+void BucketManager::PrintReport(const std::string& filename)
+{
+    if (mThreadViz)
+    {
+        DumpThreadViz();
+        return;
+    }
+
+    FILE* f = fopen(filename.c_str(), "w");
+    if (f == nullptr)
+    {
+        // the report file could not be opened; printing to or closing a null
+        // stream is not allowed
+        return;
+    }
+
+    {
+        // scoped lock: released even if printing throws
+        std::lock_guard<std::mutex> lock(mThreadMutex);
+        for (const BUCKET_THREAD& thread : mThreads)
+        {
+            PrintThread(f, thread);
+            fprintf(f, "\n");
+        }
+    }
+
+    fclose(f);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
new file mode 100644
index 00000000000..99cb10ec6e8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -0,0 +1,229 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rdtsc_buckets.h
+*
+* @brief declaration for rdtsc buckets.
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "os.h"
+#include <atomic>
+#include <vector>
+#include <mutex>
+#include <sstream>
+
+#include "rdtsc_buckets_shared.h"
+
+// unique thread id stored in thread local storage
+extern THREAD UINT tlsThreadId;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief BucketManager encapsulates a single instance of the buckets
+/// functionality. There can be one or many bucket managers active
+/// at any time. The manager owns all the threads and
+/// bucket information that have been registered to it.
+class BucketManager
+{
+public:
+    /// @param enableThreadViz - when true, a "threadviz.<pid>" directory is
+    ///        created and bucket start/stop records are serialized to
+    ///        per-thread files instead of being accumulated for the report
+    BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz)
+    {
+        if (mThreadViz)
+        {
+            uint32_t pid = GetCurrentProcessId();
+            std::stringstream str;
+            str << "threadviz." << pid;
+            mThreadVizDir = str.str();
+            CreateDirectory(mThreadVizDir.c_str(), NULL);
+        }
+    }
+
+    // removes all registered thread data
+    void ClearThreads()
+    {
+        // scoped lock: released on every exit path
+        std::lock_guard<std::mutex> lock(mThreadMutex);
+        mThreads.clear();
+    }
+
+    // removes all registered buckets
+    void ClearBuckets()
+    {
+        mBuckets.clear();
+    }
+
+    /// Registers a new thread with the manager.
+    /// @param name - name of thread, used for labels in reports and threadviz
+    void RegisterThread(const std::string& name);
+
+    /// Registers a new bucket type with the manager. Returns a unique
+    /// id which should be used in subsequent calls to start/stop the bucket
+    /// @param desc - description of the bucket
+    /// @return unique id
+    UINT RegisterBucket(const BUCKET_DESC& desc);
+
+    // dump threadviz data
+    void DumpThreadViz();
+
+    // print report
+    void PrintReport(const std::string& filename);
+
+    // start capturing
+    INLINE void StartCapture()
+    {
+        mCapturing = true;
+    }
+
+    // stop capturing
+    INLINE void StopCapture()
+    {
+        mCapturing = false;
+
+        // wait for all threads to pop back to root bucket
+        // NOTE(review): this spins while reading mThreads without taking
+        // mThreadMutex - confirm no thread registers during StopCapture.
+        bool stillCapturing = true;
+        while (stillCapturing)
+        {
+            stillCapturing = false;
+            for (const BUCKET_THREAD& t : mThreads)
+            {
+                if (t.pCurrent != &t.root)
+                {
+                    // one busy thread is enough to need another pass
+                    stillCapturing = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    // start a bucket
+    // @param id generated by RegisterBucket
+    INLINE void StartBucket(UINT id)
+    {
+        if (!mCapturing) return;
+
+        SWR_ASSERT(tlsThreadId < mThreads.size());
+        // id indexes both mBuckets and the per-bucket children array
+        SWR_ASSERT(id < mBuckets.size());
+
+        BUCKET_THREAD& bt = mThreads[tlsThreadId];
+
+        // if threadviz is enabled, only need to dump start info to threads viz file
+        if (mThreadViz)
+        {
+            SWR_ASSERT(bt.vizFile != nullptr);
+            if (mBuckets[id].enableThreadViz)
+            {
+                VIZ_START_DATA data{ VIZ_START, id, __rdtsc() };
+                Serialize(bt.vizFile, data);
+            }
+        }
+        else
+        {
+            // children are indexed by bucket id; grow the array on first use
+            if (bt.pCurrent->children.size() < mBuckets.size())
+            {
+                bt.pCurrent->children.resize(mBuckets.size());
+            }
+            BUCKET &child = bt.pCurrent->children[id];
+            child.pParent = bt.pCurrent;
+            child.id = id;
+            child.start = __rdtsc();
+
+            // update thread's currently executing bucket
+            bt.pCurrent = &child;
+        }
+
+        bt.level++;
+    }
+
+    // stop the currently executing bucket
+    // @param id generated by RegisterBucket; must match the bucket started last
+    INLINE void StopBucket(UINT id)
+    {
+        SWR_ASSERT(tlsThreadId < mThreads.size());
+        BUCKET_THREAD &bt = mThreads[tlsThreadId];
+
+        // nothing was started on this thread (e.g. capture was off)
+        if (bt.level == 0) return;
+
+        if (mThreadViz)
+        {
+            SWR_ASSERT(bt.vizFile != nullptr);
+            SWR_ASSERT(id < mBuckets.size());
+            if (mBuckets[id].enableThreadViz)
+            {
+                VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() };
+                Serialize(bt.vizFile, data);
+            }
+        }
+        else
+        {
+            if (bt.pCurrent->start == 0) return;
+            SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
+
+            bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start);
+            bt.pCurrent->count++;
+
+            // pop to parent
+            bt.pCurrent = bt.pCurrent->pParent;
+        }
+
+        bt.level--;
+    }
+
+    // add `count` to the event counter of bucket `id` under the thread's
+    // currently executing bucket
+    // @param id generated by RegisterBucket
+    INLINE void AddEvent(uint32_t id, uint32_t count)
+    {
+        if (!mCapturing) return;
+
+        SWR_ASSERT(tlsThreadId < mThreads.size());
+        // id indexes the per-bucket children array
+        SWR_ASSERT(id < mBuckets.size());
+
+        BUCKET_THREAD& bt = mThreads[tlsThreadId];
+
+        // don't record events for threadviz
+        if (!mThreadViz)
+        {
+            if (bt.pCurrent->children.size() < mBuckets.size())
+            {
+                bt.pCurrent->children.resize(mBuckets.size());
+            }
+            BUCKET &child = bt.pCurrent->children[id];
+            child.pParent = bt.pCurrent;
+            child.id = id;
+            child.count += count;
+        }
+    }
+
+private:
+    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
+    void PrintThread(FILE* f, const BUCKET_THREAD& thread);
+
+    // list of active threads that have registered with this manager
+    std::vector<BUCKET_THREAD> mThreads;
+
+    // list of buckets registered with this manager
+    std::vector<BUCKET_DESC> mBuckets;
+
+    // is capturing currently enabled; written by Start/StopCapture and read by
+    // the bucket calls, which can run on different threads, so the flag is
+    // atomic (volatile does not provide inter-thread synchronization)
+    std::atomic<bool> mCapturing{ false };
+
+    std::mutex mThreadMutex;
+
+    // enable threadviz
+    bool mThreadViz{ false };
+    std::string mThreadVizDir;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
new file mode 100644
index 00000000000..41c6d5dec79
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -0,0 +1,167 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rdtsc_buckets_shared.h
+*
+* @brief declaration for rdtsc buckets.
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include <vector>
+#include <cassert>
+
+// One node of a per-thread bucket tree. Each node records timing/count
+// data and owns its child nodes by value.
+struct BUCKET
+{
+ // bucket id; AddEvent uses it as the index into the parent's children
+ uint32_t id{ 0 };
+ // rdtsc stamp subtracted from the current rdtsc when the bucket is
+ // stopped; StopBucket treats 0 as "not running"
+ uint64_t start{ 0 };
+ // accumulated rdtsc cycles spent in this bucket
+ uint64_t elapsed{ 0 };
+ // number of completed visits (or accumulated event count for AddEvent)
+ uint32_t count{ 0 };
+
+ // enclosing bucket; nullptr by default
+ // NOTE: copying a BUCKET copies this pointer verbatim
+ BUCKET* pParent{ nullptr };
+ // child buckets, stored by value
+ std::vector<BUCKET> children;
+};
+
+// Static description of a bucket. The Serialize/Deserialize overloads for
+// this type write/read the fields in declaration order.
+// NOTE: enableThreadViz and color have no default initializer; a
+// default-constructed BUCKET_DESC leaves them indeterminate.
+struct BUCKET_DESC
+{
+ // name of bucket, used in reports
+ std::string name;
+
+ // description of bucket, used in threadviz
+ std::string description;
+
+ // enable for threadviz dumping
+ bool enableThreadViz;
+
+ // threadviz color of bucket, in RGBA8_UNORM format
+ uint32_t color;
+};
+
+// Per-thread profiling state: the thread's bucket tree plus a cursor
+// (pCurrent) into that tree.
+//
+// pCurrent must always point into this object's own `root` tree, so both
+// copy construction and copy assignment re-anchor it at &root (and reset
+// `level` to match) instead of copying the source's pointer.
+//
+// NOTE: `root.children` is copied member-wise, so pParent pointers stored
+// inside copied child nodes still refer to the source object's nodes.
+struct BUCKET_THREAD
+{
+    // name of thread, used in reports
+    std::string name;
+
+    // id for this thread, assigned by the thread manager
+    uint32_t id{ 0 };
+
+    // root of the bucket hierarchy for this thread
+    BUCKET root;
+
+    // currently executing bucket somewhere in the hierarchy
+    BUCKET* pCurrent{ &root };
+
+    // currently executing hierarchy level
+    uint32_t level{ 0 };
+
+    // threadviz file object (not owned; never closed by this struct)
+    FILE* vizFile{ nullptr };
+
+    // All members start out in a defined state (previously id and pCurrent
+    // were left indeterminate).
+    BUCKET_THREAD() {}
+
+    // Copy the thread's data; the cursor restarts at this object's root.
+    BUCKET_THREAD(const BUCKET_THREAD& that)
+        : name(that.name),
+          id(that.id),
+          root(that.root),
+          pCurrent(&root),
+          level(0),
+          vizFile(that.vizFile)
+    {
+    }
+
+    // Same semantics as the copy constructor. Without this the implicit
+    // assignment would copy `that.pCurrent`, leaving the cursor pointing
+    // into another object's tree.
+    BUCKET_THREAD& operator=(const BUCKET_THREAD& that)
+    {
+        if (this != &that)
+        {
+            name = that.name;
+            id = that.id;
+            root = that.root;
+            pCurrent = &root;
+            level = 0;
+            vizFile = that.vizFile;
+        }
+        return *this;
+    }
+};
+
+// Record tags for the threadviz stream. VIZ_START / VIZ_STOP are the values
+// checked by the Deserialize overloads for VIZ_START_DATA / VIZ_STOP_DATA.
+// NOTE(review): VIZ_DATA is not referenced in this header - confirm its use.
+enum VIZ_TYPE
+{
+ VIZ_START = 0,
+ VIZ_STOP = 1,
+ VIZ_DATA = 2
+};
+
+// Threadviz "bucket started" record. Serialize() writes the struct as a raw
+// sizeof(VIZ_START_DATA) image, so the on-disk form includes any
+// compiler-inserted padding and uses the host byte order.
+struct VIZ_START_DATA
+{
+ // record tag; Deserialize asserts it equals VIZ_START
+ uint8_t type;
+ // id of the bucket that started
+ uint32_t bucketId;
+ // NOTE(review): presumably an rdtsc value like VIZ_STOP_DATA - confirm
+ uint64_t timestamp;
+};
+
+// Threadviz "bucket stopped" record. Serialize() writes the struct as a raw
+// sizeof(VIZ_STOP_DATA) image (any padding included, host byte order).
+struct VIZ_STOP_DATA
+{
+ // record tag; Deserialize asserts it equals VIZ_STOP
+ uint8_t type;
+ // StopBucket fills this with __rdtsc()
+ uint64_t timestamp;
+};
+
+// Write one start record to `f` as a raw binary image of the struct.
+// The fwrite result is not checked.
+inline void Serialize(FILE* f, const VIZ_START_DATA& data)
+{
+    const void* pRecord = &data;
+    fwrite(pRecord, sizeof(data), 1, f);
+}
+
+// Read one start record from `f` into `data` (raw struct image) and assert
+// that its tag is VIZ_START. The fread result is not checked.
+inline void Deserialize(FILE* f, VIZ_START_DATA& data)
+{
+    void* pRecord = &data;
+    fread(pRecord, sizeof(data), 1, f);
+    assert(data.type == VIZ_START);
+}
+
+// Write one stop record to `f` as a raw binary image of the struct.
+// The fwrite result is not checked.
+inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
+{
+    const void* pRecord = &data;
+    fwrite(pRecord, sizeof(data), 1, f);
+}
+
+// Read one stop record from `f` into `data` (raw struct image) and assert
+// that its tag is VIZ_STOP. The fread result is not checked.
+inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
+{
+    void* pRecord = &data;
+    fread(pRecord, sizeof(data), 1, f);
+    assert(data.type == VIZ_STOP);
+}
+
+// Write `string` to `f` as a one-byte length prefix followed by exactly
+// that many characters (no terminator).
+//
+// The prefix is a uint8_t, so 255 is the longest encodable string. The
+// previous bound of 256 let a 256-character string through with a wrapped
+// prefix of 0 while still writing all 256 bytes, which desynchronized the
+// stream for the reader.
+inline void Serialize(FILE* f, const std::string& string)
+{
+    assert(string.size() <= 255);
+
+    // Clamp so that prefix and payload always agree, even when asserts are
+    // compiled out: an oversized string is truncated rather than written
+    // with a mismatched length.
+    const uint8_t length = (string.size() > 255) ? (uint8_t)255 : (uint8_t)string.size();
+    fwrite(&length, sizeof(length), 1, f);
+    if (length != 0)
+    {
+        fwrite(string.data(), length, 1, f);
+    }
+}
+
+// Read a string written by Serialize(FILE*, const std::string&): a one-byte
+// length prefix followed by that many characters.
+//
+// On a short read (EOF or I/O error) `string` is left empty instead of
+// being built from an indeterminate length/buffer. The full `length` bytes
+// are assigned, so embedded NUL characters survive the round trip.
+inline void Deserialize(FILE* f, std::string& string)
+{
+    char cstr[256];
+    uint8_t length = 0;
+
+    if (fread(&length, sizeof(length), 1, f) != 1)
+    {
+        string.clear();
+        return;
+    }
+
+    // length <= 255 always fits in cstr
+    if (length != 0 && fread(cstr, length, 1, f) != 1)
+    {
+        string.clear();
+        return;
+    }
+
+    string.assign(cstr, length);
+}
+
+// Write a bucket description to `f`: name, description, then the raw bytes
+// of enableThreadViz and color, in that order.
+inline void Serialize(FILE* f, const BUCKET_DESC& desc)
+{
+    // length-prefixed strings first
+    Serialize(f, desc.name);
+    Serialize(f, desc.description);
+
+    // fixed-size fields as raw host-format bytes
+    const bool* pEnable = &desc.enableThreadViz;
+    const uint32_t* pColor = &desc.color;
+    fwrite(pEnable, sizeof(*pEnable), 1, f);
+    fwrite(pColor, sizeof(*pColor), 1, f);
+}
+
+// Read a bucket description from `f` in the order Serialize wrote it:
+// name, description, enableThreadViz, color. fread results are not checked.
+inline void Deserialize(FILE* f, BUCKET_DESC& desc)
+{
+    // length-prefixed strings first
+    Deserialize(f, desc.name);
+    Deserialize(f, desc.description);
+
+    // fixed-size fields as raw host-format bytes
+    bool* pEnable = &desc.enableThreadViz;
+    uint32_t* pColor = &desc.color;
+    fread(pEnable, sizeof(*pEnable), 1, f);
+    fread(pColor, sizeof(*pColor), 1, f);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
new file mode 100644
index 00000000000..8fa6d9ef408
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -0,0 +1,787 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#ifndef __SWR_SIMDINTRIN_H__
+#define __SWR_SIMDINTRIN_H__
+
+#include "os.h"
+
+#include <cassert>
+
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+// Width-specific SIMD types. Only the 8-wide configuration is provided
+// (256-bit float and integer registers, 8-bit lane mask); any other
+// KNOB_SIMD_WIDTH stops the build.
+#if KNOB_SIMD_WIDTH == 8
+typedef __m256 simdscalar;
+typedef __m256i simdscalari;
+typedef uint8_t simdmask;
+#else
+#error Unsupported vector width
+#endif
+
+// simd vector
+// Four simdscalar registers that can be addressed either as the array v[4]
+// or as the named members x, y, z, w; both views share storage through the
+// union. operator[] indexes v without bounds checking.
+OSALIGNSIMD(union) simdvector
+{
+ simdscalar v[4];
+ struct
+ {
+ simdscalar x, y, z, w;
+ };
+
+ simdscalar& operator[] (const int i) { return v[i]; }
+ const simdscalar& operator[] (const int i) const { return v[i]; }
+};
+
+#if KNOB_SIMD_WIDTH == 8
+// Width-neutral aliases for the 8-wide float intrinsics. Code written
+// against the _simd_* names maps directly onto the _mm256_* intrinsics.
+// load / store / set
+#define _simd128_maskstore_ps _mm_maskstore_ps
+#define _simd_load_ps _mm256_load_ps
+#define _simd_load1_ps _mm256_broadcast_ss
+#define _simd_loadu_ps _mm256_loadu_ps
+#define _simd_setzero_ps _mm256_setzero_ps
+#define _simd_set1_ps _mm256_set1_ps
+#define _simd_blend_ps _mm256_blend_ps
+#define _simd_blendv_ps _mm256_blendv_ps
+#define _simd_store_ps _mm256_store_ps
+// arithmetic
+#define _simd_mul_ps _mm256_mul_ps
+#define _simd_add_ps _mm256_add_ps
+#define _simd_sub_ps _mm256_sub_ps
+#define _simd_rsqrt_ps _mm256_rsqrt_ps
+#define _simd_min_ps _mm256_min_ps
+#define _simd_max_ps _mm256_max_ps
+#define _simd_movemask_ps _mm256_movemask_ps
+// float <-> int conversions
+#define _simd_cvtps_epi32 _mm256_cvtps_epi32
+#define _simd_cvttps_epi32 _mm256_cvttps_epi32
+#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
+// comparisons; all use the ordered, quiet (_OQ) predicates
+#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
+#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
+#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
+#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
+#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
+#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
+#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
+// bitwise
+#define _simd_and_ps _mm256_and_ps
+#define _simd_or_ps _mm256_or_ps
+
+#define _simd_rcp_ps _mm256_rcp_ps
+#define _simd_div_ps _mm256_div_ps
+#define _simd_castsi_ps _mm256_castsi256_ps
+#define _simd_andnot_ps _mm256_andnot_ps
+#define _simd_round_ps _mm256_round_ps
+#define _simd_castpd_ps _mm256_castpd_ps
+#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
+
+// NOTE(review): there does not appear to be an `_mm256_load_sd` intrinsic
+// (only `_mm_load_sd`); this alias would presumably fail to compile if it
+// were ever expanded - confirm and fix or remove.
+#define _simd_load_sd _mm256_load_sd
+#define _simd_movemask_pd _mm256_movemask_pd
+#define _simd_castsi_pd _mm256_castsi256_pd
+
+// emulated integer simd
+// SIMD_EMU_EPI(func, intrin) defines `func`, a 256-bit two-operand integer
+// operation built from the 128-bit intrinsic `intrin`: each operand is split
+// into its low and high 128-bit halves, `intrin` is applied to each pair of
+// halves, and the two results are recombined into one 256-bit value.
+#define SIMD_EMU_EPI(func, intrin) \
+INLINE \
+__m256i func(__m256i a, __m256i b)\
+{\
+    const __m128i aLo = _mm256_castsi256_si128(a);\
+    const __m128i bLo = _mm256_castsi256_si128(b);\
+    const __m128i aHi = _mm256_extractf128_si256(a, 1);\
+    const __m128i bHi = _mm256_extractf128_si256(b, 1);\
+\
+    const __m128i resLo = intrin(aLo, bLo);\
+    const __m128i resHi = intrin(aHi, bHi);\
+\
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(resLo), resHi, 1);\
+}
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+// KNOB_ARCH_AVX build: the 256-bit integer operations below are routed to
+// _simdemu_* functions that operate on two 128-bit halves.
+#define _simd_mul_epi32 _simdemu_mul_epi32
+#define _simd_mullo_epi32 _simdemu_mullo_epi32
+#define _simd_sub_epi32 _simdemu_sub_epi32
+#define _simd_sub_epi64 _simdemu_sub_epi64
+#define _simd_min_epi32 _simdemu_min_epi32
+#define _simd_min_epu32 _simdemu_min_epu32
+#define _simd_max_epi32 _simdemu_max_epi32
+#define _simd_max_epu32 _simdemu_max_epu32
+#define _simd_add_epi32 _simdemu_add_epi32
+#define _simd_and_si _simdemu_and_si
+#define _simd_andnot_si _simdemu_andnot_si
+#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
+#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
+#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
+#define _simd_or_si _simdemu_or_si
+#define _simd_castps_si _mm256_castps_si256
+#define _simd_adds_epu8 _simdemu_adds_epu8
+#define _simd_subs_epu8 _simdemu_subs_epu8
+#define _simd_add_epi8 _simdemu_add_epi8
+#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
+#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_movemask_epi8 _simdemu_movemask_epi8
+
+// Instantiate the emulation functions: each is the named 128-bit intrinsic
+// applied independently to the low and high halves (see SIMD_EMU_EPI).
+SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
+SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
+SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
+SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
+SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
+SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
+SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
+SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
+SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
+SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
+SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
+SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
+SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
+SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
+SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
+SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+
+// 32-bit integer unpacks expressed through the float unpack intrinsics
+// (bit-pattern casts only, no value conversion)
+#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
+#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
+
+// shifts go through the _simdemu_* shift helpers
+#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
+#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
+#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
+// NOTE(review): _simdemu_srli_si128 is not defined in the visible part of
+// this header - confirm where it comes from.
+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
+
+// fused multiply-add/sub are emulated with separate multiply and add/sub
+#define _simd128_fmadd_ps _mm_fmaddemu_ps
+#define _simd_fmadd_ps _mm_fmaddemu256_ps
+#define _simd_fmsub_ps _mm_fmsubemu256_ps
+#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
+SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
+
+// Emulated 128-bit multiply-add: (a * b) + c as a separate multiply and add.
+INLINE
+__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
+{
+    const __m128 product = _mm_mul_ps(a, b);
+    return _mm_add_ps(product, c);
+}
+
+// Emulated 256-bit multiply-add: (a * b) + c as a separate multiply and add.
+INLINE
+__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
+{
+    const __m256 product = _mm256_mul_ps(a, b);
+    return _mm256_add_ps(product, c);
+}
+
+// Emulated 256-bit multiply-subtract: (a * b) - c as a separate multiply
+// and subtract.
+INLINE
+__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
+{
+    const __m256 product = _mm256_mul_ps(a, b);
+    return _mm256_sub_ps(product, c);
+}
+
+// Software stand-in for _mm256_i32gather_ps: for each lane, load one float
+// from (byte address of pBase) + index * scale.
+//
+// The hardware gather sign-extends each 32-bit index before scaling, so the
+// indices are read as int32_t and the byte offset is formed in pointer
+// width. (Previously the offset was computed in uint32_t, which wrapped
+// modulo 2^32 and could not express negative indices on 64-bit builds.)
+//   pBase    - base address the byte offsets are applied to
+//   vOffsets - eight signed 32-bit indices
+//   scale    - multiplier applied to each index to form a byte offset
+INLINE
+__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
+{
+    const int32_t *pOffsets = (const int32_t*)&vOffsets;
+    simdscalar vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+    {
+        // sign-extend, then scale, in pointer-sized arithmetic
+        const intptr_t offset = (intptr_t)pOffsets[i] * scale;
+        pResult[i] = *(const float*)((const uint8_t*)pBase + offset);
+    }
+
+    return vResult;
+}
+
+// Software stand-in for _mm256_mask_i32gather_ps: lanes whose mask sign bit
+// is set are loaded from (byte address of pBase) + index * scale; all other
+// lanes keep the value from vSrc and perform no memory access.
+//
+// As in the hardware gather, each 32-bit index is sign-extended before
+// scaling. (Previously the offset was computed in uint32_t, which wrapped
+// modulo 2^32 and could not express negative indices on 64-bit builds.)
+INLINE
+__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
+{
+    const int32_t *pOffsets = (const int32_t*)&vOffsets;
+    simdscalar vResult = vSrc;
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t mask = _simd_movemask_ps(vMask);
+
+    // visit only the active lanes, lowest lane first
+    while (_BitScanForward(&index, mask))
+    {
+        mask &= ~(1 << index);
+        // sign-extend, then scale, in pointer-sized arithmetic
+        const intptr_t offset = (intptr_t)pOffsets[index] * scale;
+        pResult[index] = *(const float*)((const uint8_t*)pBase + offset);
+    }
+
+    return vResult;
+}
+
+// Emulated 256-bit per-lane absolute value of 32-bit integers, computed on
+// the two 128-bit halves with _mm_abs_epi32.
+INLINE
+__m256i _simd_abs_epi32(__m256i a)
+{
+    const __m128i absLo = _mm_abs_epi32(_mm256_castsi256_si128(a));
+    const __m128i absHi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
+
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(absLo), absHi, 1);
+}
+
+// Emulated 256-bit byte movemask: the 16-bit mask of the low half occupies
+// bits 0-15 of the result and the mask of the high half bits 16-31.
+INLINE
+int _simdemu_movemask_epi8(__m256i a)
+{
+    const int loBits = _mm_movemask_epi8(_mm256_castsi256_si128(a));
+    const int hiBits = _mm_movemask_epi8(_mm256_extractf128_si256(a, 1));
+
+    return (hiBits << 16) | loBits;
+}
+#else
+
+// Non-AVX (KNOB_ARCH != KNOB_ARCH_AVX) build: the integer operations map
+// straight onto the native 256-bit intrinsics.
+#define _simd_mul_epi32 _mm256_mul_epi32
+#define _simd_mullo_epi32 _mm256_mullo_epi32
+#define _simd_sub_epi32 _mm256_sub_epi32
+#define _simd_sub_epi64 _mm256_sub_epi64
+#define _simd_min_epi32 _mm256_min_epi32
+#define _simd_max_epi32 _mm256_max_epi32
+#define _simd_min_epu32 _mm256_min_epu32
+#define _simd_max_epu32 _mm256_max_epu32
+#define _simd_add_epi32 _mm256_add_epi32
+#define _simd_and_si _mm256_and_si256
+#define _simd_andnot_si _mm256_andnot_si256
+#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
+// a < b is expressed as b > a
+#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
+#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
+#define _simd_or_si _mm256_or_si256
+#define _simd_castps_si _mm256_castps_si256
+
+#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
+#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
+
+// NOTE(review): _simd_srli_si is only defined in this branch, and
+// _simdemu_srli_si128 is not defined in the visible part of this header -
+// confirm both.
+#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
+#define _simd_slli_epi32 _mm256_slli_epi32
+#define _simd_srai_epi32 _mm256_srai_epi32
+#define _simd_srli_epi32 _mm256_srli_epi32
+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
+#define _simd128_fmadd_ps _mm_fmadd_ps
+#define _simd_fmadd_ps _mm256_fmadd_ps
+#define _simd_fmsub_ps _mm256_fmsub_ps
+#define _simd_shuffle_epi8 _mm256_shuffle_epi8
+#define _simd_adds_epu8 _mm256_adds_epu8
+#define _simd_subs_epu8 _mm256_subs_epu8
+#define _simd_add_epi8 _mm256_add_epi8
+#define _simd_i32gather_ps _mm256_i32gather_ps
+#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
+#define _simd_abs_epi32 _mm256_abs_epi32
+
+#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
+#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_movemask_epi8 _mm256_movemask_epi8
+#endif
+
+// Aliases shared by both architecture branches.
+// _simd_cvttps_epi32 and _simd_sub_ps repeat earlier definitions in this
+// header with identical replacement text, so the redefinition is benign.
+#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
+#define _simd_shuffle_ps _mm256_shuffle_ps
+#define _simd_set1_epi32 _mm256_set1_epi32
+#define _simd_set1_epi8 _mm256_set1_epi8
+#define _simd_setzero_si _mm256_setzero_si256
+#define _simd_cvttps_epi32 _mm256_cvttps_epi32
+#define _simd_store_si _mm256_store_si256
+#define _simd_broadcast_ss _mm256_broadcast_ss
+#define _simd_maskstore_ps _mm256_maskstore_ps
+#define _simd_load_si _mm256_load_si256
+#define _simd_loadu_si _mm256_loadu_si256
+#define _simd_sub_ps _mm256_sub_ps
+#define _simd_testz_ps _mm256_testz_ps
+#define _simd_xor_ps _mm256_xor_ps
+
+
+// Integer blend implemented with the float blendv: the operands are
+// reinterpreted as float lanes (bit casts only), blended under `mask`, and
+// reinterpreted back to integers.
+INLINE
+simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
+{
+    const simdscalar aBits = _simd_castsi_ps(a);
+    const simdscalar bBits = _simd_castsi_ps(b);
+    const simdscalar blended = _simd_blendv_ps(aBits, bBits, mask);
+    return _simd_castps_si(blended);
+}
+
+// convert bitmask to vector mask
+// Lane i of the result is all-ones when bit i (i = 0..7) of `mask` is set
+// and all-zeros otherwise.
+INLINE
+simdscalar vMask(int32_t mask)
+{
+    // lane i holds the single bit (1 << i); _mm256_set_epi32 lists the
+    // highest lane first
+    const __m256i laneBit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+
+    // isolate each lane's own bit from the broadcast mask
+    const __m256i isolated = _simd_and_si(_mm256_set1_epi32(mask), laneBit);
+
+    // 0 < isolated  ->  lane becomes all-ones
+    const __m256i lanes = _simd_cmplt_epi32(_mm256_setzero_si256(), isolated);
+    return _simd_castsi_ps(lanes);
+}
+
+// Copy lane `slane` of `s` into lane `rlane` of `r`, leaving the other lanes
+// of `r` untouched. Both registers are spilled to aligned scratch arrays,
+// the element is copied, and `r` is reloaded.
+INLINE
+void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
+{
+    OSALIGNSIMD(float) dstLanes[KNOB_SIMD_WIDTH];
+    OSALIGNSIMD(float) srcLanes[KNOB_SIMD_WIDTH];
+
+    _mm256_store_ps(dstLanes, r);
+    _mm256_store_ps(srcLanes, s);
+
+    dstLanes[rlane] = srcLanes[slane];
+
+    r = _mm256_load_ps(dstLanes);
+}
+
+// Emulated 256-bit logical left shift of 32-bit lanes by `i` bits, done on
+// the two 128-bit halves.
+INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
+{
+    const __m128i hiHalf = _mm256_extractf128_si256(a, 1);
+    const __m128i loHalf = _mm256_castsi256_si128(a);
+
+    const __m128i shiftedHi = _mm_slli_epi32(hiHalf, i);
+    const __m128i shiftedLo = _mm_slli_epi32(loHalf, i);
+
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(shiftedLo), shiftedHi, 1);
+}
+
+// Emulated 256-bit arithmetic (sign-propagating) right shift of 32-bit
+// lanes by `i` bits, done on the two 128-bit halves.
+INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
+{
+    const __m128i hiHalf = _mm256_extractf128_si256(a, 1);
+    const __m128i loHalf = _mm256_castsi256_si128(a);
+
+    const __m128i shiftedHi = _mm_srai_epi32(hiHalf, i);
+    const __m128i shiftedLo = _mm_srai_epi32(loHalf, i);
+
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(shiftedLo), shiftedHi, 1);
+}
+
+// Emulated 256-bit logical (zero-filling) right shift of 32-bit lanes by
+// `i` bits, done on the two 128-bit halves.
+INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
+{
+    const __m128i hiHalf = _mm256_extractf128_si256(a, 1);
+    const __m128i loHalf = _mm256_castsi256_si128(a);
+
+    const __m128i shiftedHi = _mm_srli_epi32(hiHalf, i);
+    const __m128i shiftedLo = _mm_srli_epi32(loHalf, i);
+
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(shiftedLo), shiftedHi, 1);
+}
+
+// Placeholder: the 8-wide transpose is not implemented. Calling this only
+// fires the assertion below and leaves `v` unmodified.
+INLINE
+void _simdvec_transpose(simdvector &v)
+{
+ SWR_ASSERT(false, "Need to implement 8 wide version");
+}
+
+#else
+#error Unsupported vector width
+#endif
+
+// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
+// Each of the four floats at `p` is broadcast across all lanes of the
+// matching component of `r`.
+INLINE
+void _simdvec_load_ps(simdvector& r, const float *p)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_set1_ps(p[comp]);
+    }
+}
+
+// Set all four components of `r` to the same simdscalar `s`.
+INLINE
+void _simdvec_mov(simdvector& r, const simdscalar& s)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = s;
+    }
+}
+
+// Copy all four components of `v` into `r`, component by component.
+INLINE
+void _simdvec_mov(simdvector& r, const simdvector& v)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = v[comp];
+    }
+}
+
+// just move a lane from the source simdvector to dest simdvector
+// For each of the four components, lane `slane` of `s` is copied into lane
+// `rlane` of `r`.
+INLINE
+void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        _simd_mov(r[comp], rlane, s[comp], slane);
+    }
+}
+
+// Per-lane 3-component dot product:
+//   r = (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+// The w components are ignored. Terms are accumulated left to right.
+INLINE
+void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
+{
+    r = _simd_mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+    // add the y and z products in turn
+    for (int comp = 1; comp < 3; ++comp)
+    {
+        r = _simd_add_ps(r, _simd_mul_ps(v0[comp], v1[comp]));
+    }
+}
+
+// Per-lane 4-component dot product:
+//   r = (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
+// Terms are accumulated left to right.
+INLINE
+void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
+{
+    r = _simd_mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+    // add the y, z and w products in turn
+    for (int comp = 1; comp < 4; ++comp)
+    {
+        r = _simd_add_ps(r, _simd_mul_ps(v0[comp], v1[comp]));
+    }
+}
+
+// Per-lane reciprocal length of `v`: rsqrt of the 4-component dot product
+// of `v` with itself (so w contributes to the length).
+INLINE
+simdscalar _simdvec_rcp_length_ps(const simdvector& v)
+{
+    simdscalar lengthSquared;
+    _simdvec_dp4_ps(lengthSquared, v, v);
+
+    return _simd_rsqrt_ps(lengthSquared);
+}
+
+// Scale every component of `v` by the reciprocal length returned from
+// _simdvec_rcp_length_ps and store the result in `r`.
+INLINE
+void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
+{
+    const simdscalar rcpLength = _simdvec_rcp_length_ps(v);
+
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_mul_ps(v[comp], rcpLength);
+    }
+}
+
+// r = v * s: every component of `v` multiplied by the simdscalar `s`.
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_mul_ps(v[comp], s);
+    }
+}
+
+// r = v0 * v1, component-wise.
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_mul_ps(v0[comp], v1[comp]);
+    }
+}
+
+// r = v0 + v1, component-wise.
+INLINE
+void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_add_ps(v0[comp], v1[comp]);
+    }
+}
+
+// r = min(v0, s): every component of `v0` is min'ed against the simdscalar `s`.
+INLINE
+void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_min_ps(v0[comp], s);
+    }
+}
+
+// r = max(v0, s): every component of `v0` is max'ed against the simdscalar `s`.
+INLINE
+void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        r[comp] = _simd_max_ps(v0[comp], s);
+    }
+}
+
+// Matrix4x4 * Vector4
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+//
+// pMatrix is read as 16 floats with element (row, col) at
+// pMatrix[row * 4 + col]; each element is broadcast across all lanes.
+// Rows are processed and stored in order 0..3, and each row's terms are
+// summed left to right.
+INLINE
+void _simd_mat4x4_vec4_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    for (int row = 0; row < 4; ++row)
+    {
+        const float *pRow = pMatrix + row * 4;
+
+        simdscalar sum = _simd_mul_ps(_simd_load1_ps(pRow + 0), v[0]);     // (m[row][0] * v.x)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 1), v[1])); // + (m[row][1] * v.y)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 2), v[2])); // + (m[row][2] * v.z)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 3), v[3])); // + (m[row][3] * v.w)
+
+        result[row] = sum;
+    }
+}
+
+// Matrix * Vector3 - Direction Vector where w = 0.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
+// outVec.w = 0
+//
+// Only the upper-left 3x3 of the matrix is read, but rows are still spaced
+// 4 floats apart: element (row, col) is at pMatrix[row * 4 + col].
+// Rows are processed and stored in order 0..2, then w is zeroed.
+INLINE
+void _simd_mat3x3_vec3_w0_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    for (int row = 0; row < 3; ++row)
+    {
+        const float *pRow = pMatrix + row * 4;
+
+        simdscalar sum = _simd_mul_ps(_simd_load1_ps(pRow + 0), v[0]);     // (m[row][0] * v.x)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 1), v[1])); // + (m[row][1] * v.y)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 2), v[2])); // + (m[row][2] * v.z)
+
+        result[row] = sum;
+    }
+
+    result[3] = _simd_setzero_ps();
+}
+
+// Matrix4x4 * Vector3 - Position vector where w = 1.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+//
+// v[3] is never read; the fourth matrix column is added directly.
+// Element (row, col) is at pMatrix[row * 4 + col]. Rows are processed and
+// stored in order 0..3, and each row's terms are summed left to right.
+INLINE
+void _simd_mat4x4_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    for (int row = 0; row < 4; ++row)
+    {
+        const float *pRow = pMatrix + row * 4;
+
+        simdscalar sum = _simd_mul_ps(_simd_load1_ps(pRow + 0), v[0]);     // (m[row][0] * v.x)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 1), v[1])); // + (m[row][1] * v.y)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 2), v[2])); // + (m[row][2] * v.z)
+        sum = _simd_add_ps(sum, _simd_load1_ps(pRow + 3));                   // + (m[row][3] * 1)
+
+        result[row] = sum;
+    }
+}
+
+// Matrix (first three rows) * Vector3 - Position vector where w = 1.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+// outVec.w = 1
+//
+// v[3] and the fourth matrix row are never read. Element (row, col) is at
+// pMatrix[row * 4 + col]. Rows are processed and stored in order 0..2,
+// then w is set to 1.0f.
+INLINE
+void _simd_mat4x3_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    for (int row = 0; row < 3; ++row)
+    {
+        const float *pRow = pMatrix + row * 4;
+
+        simdscalar sum = _simd_mul_ps(_simd_load1_ps(pRow + 0), v[0]);     // (m[row][0] * v.x)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 1), v[1])); // + (m[row][1] * v.y)
+        sum = _simd_add_ps(sum, _simd_mul_ps(_simd_load1_ps(pRow + 2), v[2])); // + (m[row][2] * v.z)
+        sum = _simd_add_ps(sum, _simd_load1_ps(pRow + 3));                   // + (m[row][3] * 1)
+
+        result[row] = sum;
+    }
+
+    result[3] = _simd_set1_ps(1.0f);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Compute plane equation vA * vX + vB * vY + vC
+/// @details Evaluated as two chained multiply-adds: first vA * vX + vC,
+///          then vB * vY added on top of that partial result.
+INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+{
+    const simdscalar vPartial = _simd_fmadd_ps(vA, vX, vC);
+    return _simd_fmadd_ps(vB, vY, vPartial);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Interpolates a single component.
+/// @details Computes vA * vI + vB * vJ + vC * (1 - vI - vJ), where vA, vB
+///          and vC are the coefficients read at offsets 0, 4 and 8 (plus
+///          Comp) within the attribute's 12-float slot.
+/// @param vI - barycentric I
+/// @param vJ - barycentric J
+/// @param pInterpBuffer - pointer to attribute barycentric coeffs
+template<UINT Attrib, UINT Comp>
+static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+{
+    // first coefficient of this attribute/component; B and C follow at
+    // strides of 4 floats
+    const float *pCoeffs = pInterpBuffer + (Attrib * 12 + Comp);
+
+    simdscalar vA = _simd_broadcast_ss(pCoeffs + 0);
+    simdscalar vB = _simd_broadcast_ss(pCoeffs + 4);
+    simdscalar vC = _simd_broadcast_ss(pCoeffs + 8);
+
+    // third barycentric weight: k = 1 - i - j
+    const simdscalar vOne = _simd_set1_ps(1.0f);
+    const simdscalar vK = _simd_sub_ps(_simd_sub_ps(vOne, vI), vJ);
+    vC = _simd_mul_ps(vK, vC);
+
+    return vplaneps(vA, vB, vC, vI, vJ);
+}
+
+
+#endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
new file mode 100644
index 00000000000..0bffd2c8000
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
@@ -0,0 +1,238 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#include "common/os.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+
+#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
+
+#if defined(_WIN32)
+#pragma comment(lib, "user32.lib")
+#endif // _WIN32
+
+enum TextColor // Foreground colors built from red/green/blue bits; on the non-Windows path SetTextColor adds the value to 30 to form the ANSI code.
+{
+ TEXT_BLACK = 0,
+ TEXT_RED = 1, // red bit
+ TEXT_GREEN = 2, // green bit
+ TEXT_BLUE = 4, // blue bit
+ TEXT_PURPLE = TEXT_RED | TEXT_BLUE,
+ TEXT_CYAN = TEXT_GREEN | TEXT_BLUE,
+ TEXT_YELLOW = TEXT_RED | TEXT_GREEN,
+ TEXT_WHITE = TEXT_RED | TEXT_GREEN | TEXT_BLUE,
+};
+
+enum TextStyle // Brightness selector passed to SetTextColor.
+{
+ TEXT_NORMAL = 0,
+ TEXT_INTENSITY = 1, // bright variant: FOREGROUND_INTENSITY on Windows, +60 on the ANSI color code elsewhere
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Selects the foreground color used for subsequent console output.
+/// @param stream - stream to recolor; on Windows anything other than
+///                 stdout / stderr is left untouched.
+/// @param color - red/green/blue bitmask selecting the color
+/// @param style - TEXT_INTENSITY selects the bright variant of the color
+void SetTextColor(FILE* stream, TextColor color = TEXT_WHITE, TextStyle style = TEXT_NORMAL)
+{
+#if defined(_WIN32)
+
+    HANDLE hConsoleHandle = nullptr;
+    if (stream == stderr)
+    {
+        hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE);
+    }
+    else if (stream == stdout)
+    {
+        hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
+    }
+    else
+    {
+        // Not a console stream, do nothing
+        return;
+    }
+
+    // Translate the color bits and the intensity bit into console attributes.
+    WORD textAttributes = 0;
+    if (color & TEXT_RED)
+    {
+        textAttributes |= FOREGROUND_RED;
+    }
+    if (color & TEXT_GREEN)
+    {
+        textAttributes |= FOREGROUND_GREEN;
+    }
+    if (color & TEXT_BLUE)
+    {
+        textAttributes |= FOREGROUND_BLUE;
+    }
+    if (style & TEXT_INTENSITY)
+    {
+        textAttributes |= FOREGROUND_INTENSITY;
+    }
+    SetConsoleTextAttribute(hConsoleHandle, textAttributes);
+
+#else // !_WIN32
+
+    // Print ANSI codes: 30 + color is the normal foreground color, +60 the
+    // bright one. Both values are computed as int so the arguments match
+    // the "%d" conversions (the original passed a uint32_t and an enum).
+    const int styleCode = static_cast<int>(style);
+    const int colorCode = 30 + (style ? 60 : 0) + static_cast<int>(color);
+    fprintf(stream, "\033[0m\033[%d;%dm", styleCode, colorCode);
+
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores the default text color on the given stream.
+/// @param stream - stream whose color is reset
+void ResetTextColor(FILE* stream)
+{
+#if !defined(_WIN32)
+
+    // ANSI "reset attributes" sequence
+    fputs("\033[0m", stream);
+
+#else // _WIN32
+
+    // SetTextColor's default arguments select white / normal.
+    SetTextColor(stream);
+
+#endif
+}
+
+#if defined(_WIN32)
+//////////////////////////////////////////////////////////////////////////
+/// @brief Length of the text left in a buffer by a secure-CRT formatting call.
+/// @param pText - null-terminated buffer the call wrote to
+/// @param written - value returned by the call; negative when the text was
+///                  clipped, in which case the stored text is measured instead.
+static int SwrAssertStoredLength(const char* pText, int written)
+{
+    if (written >= 0) { return written; }
+
+    int len = 0;
+    while (pText[len] != 0) { ++len; }
+    return len;
+}
+#endif // _WIN32
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Reports a failed assertion and tells the caller whether to break.
+/// @param chkDebugger - Windows, dialogs disabled: when true a break is only
+///                      requested if a debugger is attached.
+/// @param enabled - per-call-site flag; when false the assert is ignored.
+///                  Cleared when the user picks "Cancel" in the dialog.
+/// @param pExpression - text of the failed expression
+/// @param pFileName - source file containing the assert
+/// @param lineNum - source line of the assert
+/// @param pFunction - name of the enclosing function
+/// @param pFmtString - optional printf-style message, may be nullptr
+/// @return true when the caller should break into the debugger.
+bool SwrAssert(
+    bool chkDebugger,
+    bool& enabled,
+    const char* pExpression,
+    const char* pFileName,
+    uint32_t lineNum,
+    const char* pFunction,
+    const char* pFmtString /* = nullptr */,
+    ...)
+{
+    if (!enabled) return false;
+
+    SetTextColor(stderr, TEXT_CYAN, TEXT_NORMAL);
+
+    // lineNum is unsigned, so it is printed with %u.
+    fprintf(stderr, "%s(%u): ", pFileName, lineNum);
+
+    SetTextColor(stderr, TEXT_RED, TEXT_INTENSITY);
+
+    fprintf(stderr, "ASSERT: %s\n", pExpression);
+
+    SetTextColor(stderr, TEXT_CYAN, TEXT_INTENSITY);
+    fprintf(stderr, "\t%s\n", pFunction);
+
+    if (pFmtString)
+    {
+        SetTextColor(stderr, TEXT_YELLOW, TEXT_INTENSITY);
+        fprintf(stderr, "\t");
+        va_list args;
+        va_start(args, pFmtString);
+        vfprintf(stderr, pFmtString, args);
+        va_end(args);
+        fprintf(stderr, "\n");
+    }
+    ResetTextColor(stderr);
+    fflush(stderr);
+
+#if defined(_WIN32)
+    static const int MAX_MESSAGE_LEN = 2048;
+    char msgBuf[MAX_MESSAGE_LEN];
+
+    // All formatting below passes _TRUNCATE: without it the secure CRT calls
+    // the invalid parameter handler (terminating the process by default) as
+    // soon as the text does not fit, i.e. a long message would crash inside
+    // the assert handler. With _TRUNCATE the text is clipped and stays
+    // null-terminated.
+    _snprintf_s(msgBuf, sizeof(msgBuf), _TRUNCATE, "%s(%u): ASSERT: %s\n", pFileName, lineNum, pExpression);
+    msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
+    msgBuf[MAX_MESSAGE_LEN - 1] = 0;
+    OutputDebugStringA(msgBuf);
+
+    _snprintf_s(msgBuf, sizeof(msgBuf), _TRUNCATE, "\t%s\n", pFunction);
+    msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
+    msgBuf[MAX_MESSAGE_LEN - 1] = 0;
+    OutputDebugStringA(msgBuf);
+
+    // Number of characters of the user message kept at the start of msgBuf;
+    // the dialog text is appended behind it.
+    int offset = 0;
+
+    if (pFmtString)
+    {
+        va_list args;
+        va_start(args, pFmtString);
+        const int written = _vsnprintf_s(
+            msgBuf,
+            sizeof(msgBuf),
+            _TRUNCATE,
+            pFmtString,
+            args);
+        va_end(args);
+
+        // A clipped message is still reported instead of being dropped.
+        offset = SwrAssertStoredLength(msgBuf, written);
+
+        OutputDebugStringA("\t");
+        OutputDebugStringA(msgBuf);
+        OutputDebugStringA("\n");
+    }
+
+    if (KNOB_ENABLE_ASSERT_DIALOGS)
+    {
+        int retval = _snprintf_s(
+            &msgBuf[offset],
+            MAX_MESSAGE_LEN - offset,
+            _TRUNCATE,
+            "\n\n"
+            "File: %s\n"
+            "Line: %u\n"
+            "\n"
+            "Expression: %s\n\n"
+            "Cancel: Disable this assert for the remainder of the process\n"
+            "Try Again: Break into the debugger\n"
+            "Continue: Continue execution (but leave assert enabled)",
+            pFileName,
+            lineNum,
+            pExpression);
+
+        offset += SwrAssertStoredLength(&msgBuf[offset], retval);
+
+        if (!IsDebuggerPresent())
+        {
+            _snprintf_s(
+                &msgBuf[offset],
+                MAX_MESSAGE_LEN - offset,
+                _TRUNCATE,
+                "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!");
+        }
+
+        retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION);
+
+        switch (retval)
+        {
+            case IDCANCEL:
+                // Silence this call site for the rest of the process.
+                enabled = false;
+                return false;
+
+            case IDTRYAGAIN:
+                return true;
+
+            case IDCONTINUE:
+                return false;
+        }
+        // Any other result (e.g. MessageBoxA failed) falls through to the
+        // final return and requests a break.
+    }
+    else
+    {
+        return IsDebuggerPresent() || !chkDebugger;
+    }
+#endif // _WIN32
+
+    return true;
+}
+
+#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
new file mode 100644
index 00000000000..fecadb3d499
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
@@ -0,0 +1,109 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#ifndef __SWR_ASSERT_H__
+#define __SWR_ASSERT_H__
+
+#if !defined(__SWR_OS_H__)
+#error swr_assert.h should not be included directly, please include "common/os.h" instead.
+#endif
+
+#if !defined(SWR_ENABLE_ASSERTS) // a value supplied by the build wins over the default below
+
+#if !defined(NDEBUG)
+#define SWR_ENABLE_ASSERTS 1
+#else
+#define SWR_ENABLE_ASSERTS 0
+#endif // !NDEBUG
+
+#endif // SWR_ENABLE_ASSERTS
+// Release asserts default to enabled regardless of NDEBUG.
+#if !defined(SWR_ENABLE_REL_ASSERTS)
+#define SWR_ENABLE_REL_ASSERTS 1
+#endif
+
+#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
+#include "assert.h"
+
+#if !defined(__cplusplus)
+// C translation units: the SwrAssert prototype below uses a reference parameter and a default argument, so map the macros onto assert().
+#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
+
+#if SWR_ENABLE_ASSERTS
+#define SWR_ASSERT(e, ...) assert(e)
+#endif
+
+#if SWR_ENABLE_REL_ASSERTS
+#define SWR_REL_ASSERT(e, ...) assert(e)
+#endif
+
+#else // __cplusplus
+// Route plain assert() through SWR_ASSERT so it gets the same reporting.
+#if SWR_ENABLE_ASSERTS
+#if defined(assert)
+#undef assert
+#endif
+#define assert(exp) SWR_ASSERT(exp)
+#endif
+// Reports a failed assert; a true return makes the _SWR_ASSERT macro below execute DEBUGBREAK.
+bool SwrAssert(
+ bool chkDebugger,
+ bool& enabled, // per-call-site flag, backed by the static swrAssertEnabled in _SWR_ASSERT
+ const char* pExpression,
+ const char* pFileName,
+ uint32_t lineNum,
+ const char* function,
+ const char* pFmtString = nullptr,
+ ...);
+// NOTE(review): expands to a bare { } block, so "if (c) SWR_ASSERT(x); else ..." does not compile; a do { } while (0) wrapper would fix that but needs every call site to end in ';' - confirm before changing.
+#define _SWR_ASSERT(chkDebugger, e, ...) {\
+ bool expFailed = !(e);\
+ if (expFailed) {\
+ static bool swrAssertEnabled = true;\
+ expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\
+ if (expFailed) { DEBUGBREAK; }\
+ }\
+}
+// SWR_ASSERT passes chkDebugger = true, SWR_REL_ASSERT passes false.
+#if SWR_ENABLE_ASSERTS
+#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
+#endif
+
+#if SWR_ENABLE_REL_ASSERTS
+#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
+#endif
+#endif // C++
+
+#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
+
+#if !SWR_ENABLE_ASSERTS // disabled asserts expand to nothing, so their arguments are never evaluated
+#define SWR_ASSERT(e, ...)
+#endif
+
+#if !SWR_ENABLE_REL_ASSERTS
+#define SWR_REL_ASSERT(e, ...)
+#endif
+// Marks an unimplemented code path: always-failing SWR_ASSERT naming the enclosing function.
+#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__)
+
+#endif//__SWR_ASSERT_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
new file mode 100644
index 00000000000..fccccab503c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -0,0 +1,1511 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file api.cpp
+*
+* @brief API implementation
+*
+******************************************************************************/
+
+#include <cfloat>
+#include <cmath>
+#include <cstdio>
+
+#include "core/api.h"
+#include "core/backend.h"
+#include "core/context.h"
+#include "core/frontend.h"
+#include "core/rasterizer.h"
+#include "core/rdtsc_core.h"
+#include "core/threads.h"
+#include "core/tilemgr.h"
+#include "core/clip.h"
+
+#include "common/simdintrin.h"
+#include "common/os.h"
+
+void SetupDefaultState(SWR_CONTEXT *pContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create SWR Context: allocates the context, its draw-context and draw-state rings, per-worker scratch space and the hot tile manager, and returns the context as an opaque HANDLE.
+/// @param pCreateInfo - pointer to creation info; read for driver, privateStateSize, maxSubContexts and the load/store/clear tile callbacks.
+HANDLE SwrCreateContext(
+ const SWR_CREATECONTEXT_INFO* pCreateInfo)
+{
+ RDTSC_RESET();
+ RDTSC_INIT(0);
+ // NOTE(review): none of the _aligned_malloc results in this function are checked before being used.
+ void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
+ memset(pContextMem, 0, sizeof(SWR_CONTEXT));
+ SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); // constructed in place in the zero-filled allocation
+
+ pContext->driverType = pCreateInfo->driver;
+ pContext->privateStateSize = pCreateInfo->privateStateSize;
+ // Rings of KNOB_MAX_DRAWS_IN_FLIGHT draw contexts and draw states, both zero-filled.
+ pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
+
+ pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
+ // The sub-context save area is only allocated when more than one sub-context is requested.
+ pContext->numSubContexts = pCreateInfo->maxSubContexts;
+ if (pContext->numSubContexts > 1)
+ {
+ pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
+ memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
+ }
+ // Give every ring slot its own arenas, tile manager and dispatch queue.
+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+ {
+ pContext->dcRing[dc].pArena = new Arena();
+ pContext->dcRing[dc].inUse = false;
+ pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
+ pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+
+ pContext->dsRing[dc].pArena = new Arena();
+ }
+
+ if (!KNOB_SINGLE_THREADED)
+ {
+ memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
+ memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
+ new (&pContext->WaitLock) std::mutex();
+ new (&pContext->FifosNotEmpty) std::condition_variable();
+ // NOTE(review): WaitLock / FifosNotEmpty are re-constructed in place above - presumably they were already constructed by SWR_CONTEXT(); confirm against context.h.
+ CreateThreadPool(pContext, &pContext->threadPool);
+ }
+
+ // Calling createThreadPool() above can set SINGLE_THREADED
+ if (KNOB_SINGLE_THREADED)
+ {
+ pContext->NumWorkerThreads = 1;
+ }
+
+ // Allocate scratch space for workers.
+ ///@note We could lazily allocate this but its rather small amount of memory.
+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
+ {
+ ///@todo Use numa API for allocations using numa information from thread data (if exists).
+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); // 32 KB per worker
+ }
+ // Draw ids and the enqueue counter both start at 1.
+ pContext->nextDrawId = 1;
+ pContext->DrawEnqueued = 1;
+
+ // State setup AFTER context is fully initialized
+ SetupDefaultState(pContext);
+
+ // initialize hot tile manager
+ pContext->pHotTileMgr = new HotTileMgr();
+
+ // initialize function pointer tables
+ InitClearTilesTable();
+
+ // initialize store tiles function
+ pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
+ pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
+ pContext->pfnClearTile = pCreateInfo->pfnClearTile;
+
+ return (HANDLE)pContext;
+}
+
+void SwrDestroyContext(HANDLE hContext) // Releases everything SwrCreateContext allocated; hContext must not be used afterwards.
+{
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ DestroyThreadPool(pContext, &pContext->threadPool); // called before any per-draw resource is freed
+
+ // free the fifos
+ for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
+ {
+ delete pContext->dcRing[i].pArena;
+ delete pContext->dsRing[i].pArena;
+ delete(pContext->dcRing[i].pTileMgr);
+ delete(pContext->dcRing[i].pDispatch);
+ }
+
+ // Free scratch space.
+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
+ {
+ _aligned_free(pContext->pScratch[i]);
+ }
+
+ _aligned_free(pContext->dcRing);
+ _aligned_free(pContext->dsRing);
+ _aligned_free(pContext->subCtxSave); // only allocated by SwrCreateContext when more than one sub-context was requested
+
+ delete(pContext->pHotTileMgr);
+ // Explicit destructor call: the context was placement-new'ed into _aligned_malloc memory.
+ pContext->~SWR_CONTEXT();
+ _aligned_free((SWR_CONTEXT*)hContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copies the API state of one draw state into another.
+/// @param dst - draw state whose `state` member is overwritten
+/// @param src - draw state to copy from
+/// @note Only the `state` member is copied (bytewise); every other
+///       DRAW_STATE member of dst is left as it was.
+void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
+{
+    API_STATE* const pTo = &dst.state;
+    const API_STATE* const pFrom = &src.state;
+    memcpy(pTo, pFrom, sizeof(API_STATE));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Notifies every waiter on the context's FifosNotEmpty object.
+/// @param pContext - SWR context owning the object
+void WakeAllThreads(SWR_CONTEXT *pContext)
+{
+    auto& fifosNotEmpty = pContext->FifosNotEmpty;
+    fifosNotEmpty.notify_all();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Reports whether a draw context still has outstanding work.
+/// @param pContext - SWR context (supplies NumWorkerThreads)
+/// @param pDC - draw context to test; its inUse flag is cleared here once
+///              the work is found to be complete.
+/// @return true while the draw context must not be reused.
+bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+{
+    // For single thread nothing should still be drawing.
+    if (KNOB_SINGLE_THREADED)
+    {
+        return false;
+    }
+
+    // A finished compute dispatch releases its draw context immediately.
+    if (pDC->isCompute && pDC->doneCompute)
+    {
+        pDC->inUse = false;
+        return false;
+    }
+
+    // Front end not done yet: report the flag as it currently stands.
+    if (pDC->doneFE != true)
+    {
+        return pDC->inUse;
+    }
+
+    // All triangles are binned; every worker must have moved past this draw
+    // in both the front end and the back end.
+    const bool workersBehind =
+        (pDC->threadsDoneFE != pContext->NumWorkerThreads) ||
+        (pDC->threadsDoneBE != pContext->NumWorkerThreads);
+    if (workersBehind)
+    {
+        return true;
+    }
+
+    pDC->inUse = false; // all work is done.
+    return pDC->inUse;
+}
+
+void QueueDraw(SWR_CONTEXT *pContext) // Marks the current draw context in use, then wakes the workers or, when KNOB_SINGLE_THREADED, runs the draw on the calling thread.
+{
+ SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
+ pContext->pCurDrawContext->inUse = true;
+ // NOTE(review): _ReadWriteBarrier is presumably a compiler-level barrier keeping the inUse store ahead of the enqueue - confirm in os.h.
+ _ReadWriteBarrier();
+ {
+ std::unique_lock<std::mutex> lock(pContext->WaitLock);
+ pContext->DrawEnqueued++; // incremented while holding WaitLock
+ }
+
+ if (KNOB_SINGLE_THREADED)
+ {
+ // flush denormals to 0
+ uint32_t mxcsr = _mm_getcsr();
+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+ // Run the FE and BE work functions inline; both cursors start at this draw's id.
+ std::unordered_set<uint32_t> lockedTiles;
+ uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+
+ // restore csr
+ _mm_setcsr(mxcsr);
+ }
+ else
+ {
+ RDTSC_START(APIDrawWakeAllThreads);
+ WakeAllThreads(pContext);
+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
+ }
+
+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
+ pContext->pPrevDrawContext = pContext->pCurDrawContext;
+ pContext->pCurDrawContext = nullptr;
+}
+
+///@todo Combine this with QueueDraw (the two differ only in the single-threaded branch, which calls WorkOnCompute here).
+void QueueDispatch(SWR_CONTEXT *pContext) // Marks the current draw context in use, then wakes the workers or, when KNOB_SINGLE_THREADED, runs the dispatch on the calling thread.
+{
+ SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
+ pContext->pCurDrawContext->inUse = true;
+ // NOTE(review): _ReadWriteBarrier is presumably a compiler-level barrier keeping the inUse store ahead of the enqueue - confirm in os.h.
+ _ReadWriteBarrier();
+ {
+ std::unique_lock<std::mutex> lock(pContext->WaitLock);
+ pContext->DrawEnqueued++; // incremented while holding WaitLock
+ }
+
+ if (KNOB_SINGLE_THREADED)
+ {
+ // flush denormals to 0
+ uint32_t mxcsr = _mm_getcsr();
+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+ // Run the compute work function inline, starting at this dispatch's draw id.
+ uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ WorkOnCompute(pContext, 0, curDispatch);
+
+ // restore csr
+ _mm_setcsr(mxcsr);
+ }
+ else
+ {
+ RDTSC_START(APIDrawWakeAllThreads);
+ WakeAllThreads(pContext);
+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
+ }
+
+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
+ pContext->pPrevDrawContext = pContext->pCurDrawContext;
+ pContext->pCurDrawContext = nullptr;
+}
+
+DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // Returns the current draw context; if none is current, claims the next ring slot (spinning until it is free), attaches a draw state to it and resets it.
+{
+ RDTSC_START(APIGetDrawContext);
+ // If current draw context is null then need to obtain a new draw context to use from ring.
+ if (pContext->pCurDrawContext == nullptr)
+ {
+ uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; // ring slot is derived from the id the new draw will receive
+
+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+ pContext->pCurDrawContext = pCurDrawContext;
+
+ // Need to wait until this draw context is available to use.
+ while (StillDrawing(pContext, pCurDrawContext))
+ {
+ _mm_pause();
+ }
+
+ // Assign next available entry in DS ring to this DC.
+ uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
+ pCurDrawContext->pState = &pContext->dsRing[dsIndex];
+
+ Arena& stateArena = *(pCurDrawContext->pState->pArena); // arena of the DS ring entry; unused on the split-draw path below
+
+ // Copy previous state to current state.
+ if (pContext->pPrevDrawContext)
+ {
+ DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
+
+ // If we're splitting our draw then we can just use the same state from the previous
+ // draw. In this case, we won't increment the DS ring index so the next non-split
+ // draw can receive the state.
+ if (isSplitDraw == false)
+ {
+ CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
+
+ stateArena.Reset(true); // Reset memory.
+ pCurDrawContext->pState->pPrivateState = nullptr;
+
+ pContext->curStateId++; // Progress state ring index forward.
+ }
+ else
+ {
+ // If its a split draw then just copy the state pointer over
+ // since its the same draw.
+ pCurDrawContext->pState = pPrevDrawContext->pState;
+ }
+ }
+ else
+ {
+ stateArena.Reset(); // Reset memory.
+ pContext->curStateId++; // Progress state ring index forward.
+ }
+ // Reset the per-draw bookkeeping of the claimed slot.
+ pCurDrawContext->dependency = 0;
+ pCurDrawContext->pArena->Reset();
+ pCurDrawContext->pContext = pContext;
+ pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
+ pCurDrawContext->inUse = false; // set to true later by QueueDraw / QueueDispatch
+
+ pCurDrawContext->doneCompute = false;
+ pCurDrawContext->doneFE = false;
+ pCurDrawContext->FeLock = 0;
+ pCurDrawContext->threadsDoneFE = 0;
+ pCurDrawContext->threadsDoneBE = 0;
+
+ pCurDrawContext->pTileMgr->initialize();
+
+ // Assign unique drawId for this DC
+ pCurDrawContext->drawId = pContext->nextDrawId++;
+ }
+ else
+ {
+ SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
+ }
+
+ RDTSC_STOP(APIGetDrawContext, 0, 0);
+ return pContext->pCurDrawContext;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Switches the active sub-context: saves the current API state into
+///        the slot of the active sub-context and loads the saved state of
+///        the requested one.
+/// @param hContext - handle to the SWR context
+/// @param subContextIndex - sub-context to activate; the call does nothing
+///                          when it is out of range or already active.
+void SWR_API SwrSetActiveSubContext(
+    HANDLE hContext,
+    uint32_t subContextIndex)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+
+    const bool outOfRange = subContextIndex >= pContext->numSubContexts;
+    if (outOfRange || subContextIndex == pContext->curSubCtxId)
+    {
+        return;
+    }
+
+    // Save and restore draw state
+    DRAW_STATE& liveState = *(GetDrawContext(pContext)->pState);
+    CopyState(pContext->subCtxSave[pContext->curSubCtxId], liveState);
+    CopyState(liveState, pContext->subCtxSave[subContextIndex]);
+
+    pContext->curSubCtxId = subContextIndex;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns the API state attached to the current draw context,
+///        obtaining a draw context first if none is current.
+/// @param pContext - SWR context
+API_STATE* GetDrawState(SWR_CONTEXT *pContext)
+{
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_ASSERT(pDC->pState != nullptr);
+
+    DRAW_STATE& drawState = *pDC->pState;
+    return &drawState.state;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Applies the default rasterizer state: no culling, CCW front faces.
+/// @param pContext - SWR context whose current draw state is modified
+void SetupDefaultState(SWR_CONTEXT *pContext)
+{
+    auto& rast = GetDrawState(pContext)->rastState;
+
+    rast.cullMode = SWR_CULLMODE_NONE;
+    rast.frontWinding = SWR_FRONTWINDING_CCW;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts the opaque API handle back into the context pointer.
+/// @param hContext - handle returned by SwrCreateContext
+static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
+{
+    SWR_CONTEXT* const pContext = (SWR_CONTEXT*)hContext;
+    return pContext;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Queues a SYNC work item whose callback is set to pfnFunc.
+/// @param hContext - handle to the SWR context
+/// @param pfnFunc - callback stored in the work item (asserted non-null)
+/// @param userData - first value stored alongside the callback
+/// @param userData2 - second value stored alongside the callback
+/// @param userData3 - third value stored alongside the callback
+void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
+{
+    RDTSC_START(APISync);
+
+    SWR_ASSERT(pfnFunc != nullptr);
+
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    // Describe the front-end work item.
+    auto& work = pDC->FeWork;
+    work.type = SYNC;
+    work.pfnWork = ProcessSync;
+
+    auto& syncDesc = work.desc.sync;
+    syncDesc.pfnCallbackFunc = pfnFunc;
+    syncDesc.userData = userData;
+    syncDesc.userData2 = userData2;
+    syncDesc.userData3 = userData3;
+
+    // cannot execute until all previous draws have completed
+    pDC->dependency = pDC->drawId - 1;
+
+    //enqueue
+    QueueDraw(pContext);
+
+    RDTSC_STOP(APISync, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Spins until no draw context in the ring reports outstanding work.
+/// @param hContext - handle to the SWR context
+void SwrWaitForIdle(HANDLE hContext)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+
+    RDTSC_START(APIWaitForIdle);
+    // Wait for all work to complete, one ring slot after the other.
+    DRAW_CONTEXT *pDC = pContext->dcRing;
+    DRAW_CONTEXT *const pRingEnd = pDC + KNOB_MAX_DRAWS_IN_FLIGHT;
+    for (; pDC != pRingEnd; ++pDC)
+    {
+        while (StillDrawing(pContext, pDC))
+        {
+            _mm_pause();
+        }
+    }
+    RDTSC_STOP(APIWaitForIdle, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copies vertex buffer descriptions into the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param numBuffers - number of entries in pVertexBuffers
+/// @param pVertexBuffers - descriptions to copy; each is stored in the slot
+///                         named by its own `index` member, in array order.
+void SwrSetVertexBuffers(
+    HANDLE hContext,
+    uint32_t numBuffers,
+    const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
+{
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+
+    const SWR_VERTEX_BUFFER_STATE* const pEnd = pVertexBuffers + numBuffers;
+    for (const SWR_VERTEX_BUFFER_STATE* pVB = pVertexBuffers; pVB != pEnd; ++pVB)
+    {
+        pState->vertexBuffers[pVB->index] = *pVB;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the index buffer description in the draw state.
+/// @param hContext - handle to the SWR context
+/// @param pIndexBuffer - description to copy (dereferenced unconditionally)
+void SwrSetIndexBuffer(
+    HANDLE hContext,
+    const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.indexBuffer = *pIndexBuffer;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the fetch function pointer in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pfnFetchFunc - function pointer to store
+void SwrSetFetchFunc(
+    HANDLE hContext,
+    PFN_FETCH_FUNC pfnFetchFunc)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnFetchFunc = pfnFetchFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the stream-output function pointer for one stream.
+/// @param hContext - handle to the SWR context
+/// @param pfnSoFunc - function pointer to store
+/// @param streamIndex - stream slot; asserted to be below MAX_SO_STREAMS
+void SwrSetSoFunc(
+    HANDLE hContext,
+    PFN_SO_FUNC pfnSoFunc,
+    uint32_t streamIndex)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+
+    SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
+
+    auto& soFuncs = apiState.pfnSoFunc;
+    soFuncs[streamIndex] = pfnSoFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the stream-output state in the draw state.
+/// @param hContext - handle to the SWR context
+/// @param pSoState - state to copy (dereferenced unconditionally)
+void SwrSetSoState(
+    HANDLE hContext,
+    SWR_STREAMOUT_STATE* pSoState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.soState = *pSoState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of one stream-output buffer description.
+/// @param hContext - handle to the SWR context
+/// @param pSoBuffer - description to copy (dereferenced unconditionally)
+/// @param slot - destination slot; asserted to be in [0, 3]
+void SwrSetSoBuffers(
+    HANDLE hContext,
+    SWR_STREAMOUT_BUFFER* pSoBuffer,
+    uint32_t slot)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+
+    SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
+
+    auto& soBuffers = apiState.soBuffer;
+    soBuffers[slot] = *pSoBuffer;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the vertex function pointer in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pfnVertexFunc - function pointer to store
+void SwrSetVertexFunc(
+    HANDLE hContext,
+    PFN_VERTEX_FUNC pfnVertexFunc)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnVertexFunc = pfnVertexFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the frontend state in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pFEState - state to copy (dereferenced unconditionally)
+void SwrSetFrontendState(
+    HANDLE hContext,
+    SWR_FRONTEND_STATE *pFEState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.frontendState = *pFEState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the GS state in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pGSState - state to copy (dereferenced unconditionally)
+void SwrSetGsState(
+    HANDLE hContext,
+    SWR_GS_STATE *pGSState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.gsState = *pGSState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the GS function pointer in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pfnGsFunc - function pointer to store
+void SwrSetGsFunc(
+    HANDLE hContext,
+    PFN_GS_FUNC pfnGsFunc)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnGsFunc = pfnGsFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the CS function pointer and its thread-group size.
+/// @param hContext - handle to the SWR context
+/// @param pfnCsFunc - function pointer to store
+/// @param totalThreadsInGroup - value stored next to the function pointer
+void SwrSetCsFunc(
+    HANDLE hContext,
+    PFN_CS_FUNC pfnCsFunc,
+    uint32_t totalThreadsInGroup)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnCsFunc = pfnCsFunc;
+    apiState.totalThreadsInGroup = totalThreadsInGroup;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the TS state in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pState - state to copy (dereferenced unconditionally)
+void SwrSetTsState(
+    HANDLE hContext,
+    SWR_TS_STATE *pState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.tsState = *pState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the HS function pointer in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pfnFunc - function pointer to store
+void SwrSetHsFunc(
+    HANDLE hContext,
+    PFN_HS_FUNC pfnFunc)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnHsFunc = pfnFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the DS function pointer in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pfnFunc - function pointer to store
+void SwrSetDsFunc(
+    HANDLE hContext,
+    PFN_DS_FUNC pfnFunc)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.pfnDsFunc = pfnFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the depth/stencil state in the draw state.
+/// @param hContext - handle to the SWR context
+/// @param pDSState - state to copy (dereferenced unconditionally)
+void SwrSetDepthStencilState(
+    HANDLE hContext,
+    SWR_DEPTH_STENCIL_STATE *pDSState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.depthStencilState = *pDSState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the backend state in the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pBEState - state to copy (dereferenced unconditionally)
+void SwrSetBackendState(
+    HANDLE hContext,
+    SWR_BACKEND_STATE *pBEState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.backendState = *pBEState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores a copy of the pixel shader state in the draw state.
+/// @param hContext - handle to the SWR context
+/// @param pPSState - state to copy (dereferenced unconditionally)
+void SwrSetPixelShaderState(
+    HANDLE hContext,
+    SWR_PS_STATE *pPSState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    apiState.psState = *pPSState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copies the blend state bytewise into the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pBlendState - source of sizeof(SWR_BLEND_STATE) bytes
+void SwrSetBlendState(
+    HANDLE hContext,
+    SWR_BLEND_STATE *pBlendState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    memcpy(&apiState.blendState, pBlendState, sizeof(SWR_BLEND_STATE));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stores the blend function pointer for one render target.
+/// @param hContext - handle to the SWR context
+/// @param renderTarget - slot; asserted to be below SWR_NUM_RENDERTARGETS
+/// @param pfnBlendFunc - function pointer to store
+void SwrSetBlendFunc(
+    HANDLE hContext,
+    uint32_t renderTarget,
+    PFN_BLEND_JIT_FUNC pfnBlendFunc)
+{
+    SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
+
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    auto& blendFuncs = apiState.pfnBlendFunc;
+    blendFuncs[renderTarget] = pfnBlendFunc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets the attribute linkage mask, count and map.
+/// @param hContext - handle to the SWR context
+/// @param mask - linkage mask; its population count becomes linkageCount
+/// @param pMap - map to copy linkageCount bytes from; nullptr selects the
+///               identity map 0..31.
+void SwrSetLinkage(
+    HANDLE hContext,
+    uint32_t mask,
+    const uint8_t* pMap)
+{
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+
+    static const uint8_t IDENTITY_MAP[] =
+    {
+         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    };
+    static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
+        "Update for new value of MAX_ATTRIBUTES");
+
+    // Fall back to the identity map when the caller supplies none.
+    const uint8_t* pSrcMap = (pMap != nullptr) ? pMap : IDENTITY_MAP;
+
+    pState->linkageMask = mask;
+    pState->linkageCount = _mm_popcnt_u32(mask);
+    memcpy(pState->linkageMap, pSrcMap, pState->linkageCount);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Update guardband multipliers for the viewport.
+/// @param pState - API state; gbState is rewritten from vp[0]
+/// @note NOTE(review): divides by vp[0].width / vp[0].height without
+///       guarding against zero - confirm callers never pass an empty viewport.
+void updateGuardband(API_STATE *pState)
+{
+    // guardband center is viewport center, so left/right and top/bottom
+    // share the same multiplier
+    const auto horizontal = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
+    const auto vertical = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
+
+    auto& guardband = pState->gbState;
+    guardband.left = horizontal;
+    guardband.right = horizontal;
+    guardband.top = vertical;
+    guardband.bottom = vertical;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copies the rasterizer state bytewise into the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param pRastState - source of sizeof(SWR_RASTSTATE) bytes
+void SwrSetRastState(
+    HANDLE hContext,
+    const SWR_RASTSTATE *pRastState)
+{
+    API_STATE& apiState = *GetDrawState(GetContext(hContext));
+    memcpy(&apiState.rastState, pRastState, sizeof(SWR_RASTSTATE));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets the viewports and their transform matrices, then refreshes
+///        the guardband from viewport 0.
+/// @param hContext - handle to the SWR context
+/// @param numViewports - number of viewports; asserted to be at most
+///                       KNOB_NUM_VIEWPORTS_SCISSORS
+/// @param pViewports - viewports to copy
+/// @param pMatrices - matrices to copy, or nullptr to derive default
+///                    matrices from the viewports
+/// @note NOTE(review): the non-DX branch combines width/height with x/y as
+///       if they were right/bottom coordinates, while updateGuardband treats
+///       them as extents - confirm which is intended.
+void SwrSetViewports(
+    HANDLE hContext,
+    uint32_t numViewports,
+    const SWR_VIEWPORT* pViewports,
+    const SWR_VIEWPORT_MATRIX* pMatrices)
+{
+    SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
+        "Invalid number of viewports.");
+
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    API_STATE* pState = GetDrawState(pContext);
+
+    memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
+
+    if (pMatrices == nullptr)
+    {
+        // Compute default viewport transform.
+        for (uint32_t i = 0; i < numViewports; ++i)
+        {
+            auto& vp = pState->vp[i];
+            auto& mat = pState->vpMatrix[i];
+
+            if (pContext->driverType == DX)
+            {
+                mat.m00 = vp.width / 2.0f;
+                mat.m11 = -vp.height / 2.0f;
+                mat.m22 = vp.maxZ - vp.minZ;
+                mat.m30 = vp.x + mat.m00;
+                mat.m31 = vp.y - mat.m11;
+                mat.m32 = vp.minZ;
+                continue;
+            }
+
+            // Standard, with the exception that Y is inverted.
+            mat.m00 = (vp.width - vp.x) / 2.0f;
+            mat.m11 = (vp.y - vp.height) / 2.0f;
+            mat.m22 = (vp.maxZ - vp.minZ) / 2.0f;
+            mat.m30 = vp.x + mat.m00;
+            mat.m31 = vp.height + mat.m11;
+            mat.m32 = vp.minZ + mat.m22;
+
+            // Now that the matrix is calculated, clip the view coords to screen size.
+            // OpenGL allows for -ve x,y in the viewport.
+            vp.x = std::max(vp.x, 0.0f);
+            vp.y = std::max(vp.y, 0.0f);
+        }
+    }
+    else
+    {
+        memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
+    }
+
+    updateGuardband(pState);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copies scissor rectangles bytewise into the current draw state.
+/// @param hContext - handle to the SWR context
+/// @param numScissors - number of rectangles; asserted to be at most
+///                      KNOB_NUM_VIEWPORTS_SCISSORS
+/// @param pScissors - rectangles to copy, starting at slot 0
+void SwrSetScissorRects(
+    HANDLE hContext,
+    uint32_t numScissors,
+    const BBOX* pScissors)
+{
+    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
+        "Invalid number of scissor rects.");
+
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
+}
+
+void SetupMacroTileScissors(DRAW_CONTEXT *pDC) // Derives scissorInFixedPoint from scissor rect 0 (when scissoring is enabled) or from viewport 0.
+{
+ API_STATE *pState = &pDC->pState->state;
+ uint32_t left, right, top, bottom; // unsigned: a negative source value wraps to a large number
+
+ // Set up scissor dimensions based on scissor or viewport
+ if (pState->rastState.scissorEnable)
+ {
+ // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive; the conversion at the end of this function subtracts one fixed-point unit from the scaled right/bottom edges
+ left = pState->scissorRects[0].left;
+ right = pState->scissorRects[0].right;
+ top = pState->scissorRects[0].top;
+ bottom = pState->scissorRects[0].bottom;
+ }
+ else
+ {
+ left = (int32_t)pState->vp[0].x; // float viewport values are truncated to integers
+ right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
+ top = (int32_t)pState->vp[0].y;
+ bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
+ }
+ // Clamp the far edges to the maximum supported scissor size.
+ right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
+ bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
+ // A near edge beyond the maximum yields an all-zero scissor; presumably this also catches negative origins that wrapped when stored as uint32_t - confirm.
+ if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
+ {
+ pState->scissorInFixedPoint.left = 0;
+ pState->scissorInFixedPoint.right = 0;
+ pState->scissorInFixedPoint.top = 0;
+ pState->scissorInFixedPoint.bottom = 0;
+ }
+ else
+ {
+ pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
+ pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1; // exclusive edge -> inclusive
+ pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
+ pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; // exclusive edge -> inclusive
+ }
+}
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Fills in the per-draw function pointers (backend, output merger,
+ ///        barycentric generators, clipper/binner), the frontend attribute
+ ///        mask and the hot tile enables of the draw state referenced by pDC.
+ /// @param pDC - Draw context whose pState is read and updated in place.
+ void SetupPipeline(DRAW_CONTEXT *pDC)
+ {
+     DRAW_STATE* pDrawState = pDC->pState;
+     API_STATE& apiState = pDrawState->state;
+     BACKEND_FUNCS& backendFuncs = pDrawState->backendFuncs;
+     const SWR_RASTSTATE& rastState = apiState.rastState;
+     const auto& psState = apiState.psState;
+     const auto& dsState = apiState.depthStencilState;
+
+     const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
+     const bool hasPixelShader = (psState.pfnPixelShader != nullptr);
+
+     // ---- backend selection ------------------------------------------------
+     if (!hasPixelShader)
+     {
+         backendFuncs.pfnBackend = gBackendNullPs[rastState.sampleCount];
+         // I & J per sample are always needed for Z interpolation
+         backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
+     }
+     else
+     {
+         const bool bMultisampleEnable = (rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount;
+         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+
+         // only 'normal' (or no) input coverage is supported at the moment
+         SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+                    psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+
+         // Start from what the pixel shader asked for; the shading rate cases
+         // below add the barycentrics required for Z interpolation.
+         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
+
+         switch (psState.shadingRate)
+         {
+         case SWR_SHADING_RATE_PIXEL:
+             if (!bMultisampleEnable)
+             {
+                 // single sampled: I & J per pixel are needed for Z interpolation
+                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
+                 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+                 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+             }
+             else
+             {
+                 // multisampled: I & J per sample are needed for Z interpolation
+                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
+                 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+                 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][apiState.blendState.sampleCount];
+             }
+             break;
+
+         case SWR_SHADING_RATE_SAMPLE:
+             SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
+             // I & J per sample are needed for Z interpolation
+             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
+             backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+             backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][apiState.blendState.sampleCount];
+             break;
+
+         case SWR_SHADING_RATE_COARSE:
+         default:
+             // no backend is selected for these rates
+             SWR_ASSERT(0 && "Invalid shading rate");
+             break;
+         }
+
+         // pick the functions that generate the barycentrics required by the PS
+         const uint32_t usePerPixel = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
+         const uint32_t usePerSample = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
+         const uint32_t useCentroid = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
+
+         backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[usePerPixel];
+         backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[usePerSample];
+         backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][useCentroid][rastState.samplePattern][forcedSampleCount];
+     }
+
+     // ---- clipper / binner selection ---------------------------------------
+     PFN_PROCESS_PRIMS pfnBinner;
+     switch (apiState.topology)
+     {
+     case TOP_POINT_LIST:
+         pDrawState->pfnProcessPrims = ClipPoints;
+         pfnBinner = BinPoints;
+         break;
+
+     case TOP_LINE_LIST:
+     case TOP_LINE_STRIP:
+     case TOP_LINE_LOOP:
+     case TOP_LINE_LIST_ADJ:
+     case TOP_LISTSTRIP_ADJ:
+         pDrawState->pfnProcessPrims = ClipLines;
+         pfnBinner = BinLines;
+         break;
+
+     default:
+         pDrawState->pfnProcessPrims = ClipTriangles;
+         pfnBinner = BinTriangles;
+         break;
+     }
+
+     // with the viewport transform disabled the clipper is bypassed and
+     // primitives go straight to the binner
+     if (apiState.frontendState.vpTransformDisable)
+     {
+         pDrawState->pfnProcessPrims = pfnBinner;
+     }
+
+     // No primitive processing at all when either:
+     //  - there is no pixel shader, no depth/stencil test or write, and no linkage, or
+     //  - the rasterizer is disabled through the stream-out state.
+     const bool nothingToRasterize =
+         !hasPixelShader &&
+         !dsState.depthTestEnable &&
+         !dsState.depthWriteEnable &&
+         !dsState.stencilTestEnable &&
+         !dsState.stencilWriteEnable &&
+         (apiState.linkageCount == 0);
+     const bool rasterizerDisabled = (apiState.soState.rasterizerDisable == true);
+
+     if (nothingToRasterize || rasterizerDisabled)
+     {
+         pDrawState->pfnProcessPrims = nullptr;
+         apiState.linkageMask = 0;
+     }
+
+     // ---- frontend attribute mask ------------------------------------------
+     // linkage attributes plus, when stream-out is on, every stream's mask
+     apiState.feAttribMask = apiState.linkageMask;
+     if (apiState.soState.soEnable)
+     {
+         for (uint32_t stream = 0; stream < 4; ++stream)
+         {
+             apiState.feAttribMask |= apiState.soState.streamMasks[stream];
+         }
+     }
+
+     // ---- hot tile enables --------------------------------------------------
+     // Detect the cases where a draw does not need backing hot tile memory.
+     // Special case: test enabled, write disabled and compare func ALWAYS.
+     const bool depthTestAlwaysNoWrite =
+         dsState.depthTestEnable &&
+         !dsState.depthWriteEnable &&
+         (dsState.depthTestFunc == ZFUNC_ALWAYS);
+     const bool depthTestOrWrite = dsState.depthTestEnable || dsState.depthWriteEnable;
+
+     apiState.depthHottileEnable = (!depthTestAlwaysNoWrite && depthTestOrWrite) ? true : false;
+
+     const bool frontStencilAlwaysNoWrite =
+         dsState.stencilTestEnable &&
+         !dsState.stencilWriteEnable &&
+         (dsState.stencilTestFunc == ZFUNC_ALWAYS);
+     // for stencil the double sided state has to be looked at as well
+     const bool backStencilAlwaysNoWrite =
+         dsState.doubleSidedStencilTestEnable &&
+         !dsState.stencilWriteEnable &&
+         (dsState.backfaceStencilTestFunc == ZFUNC_ALWAYS);
+     const bool stencilTestOrWrite = dsState.stencilTestEnable || dsState.stencilWriteEnable;
+
+     apiState.stencilHottileEnable =
+         ((!frontStencilAlwaysNoWrite || !backStencilAlwaysNoWrite) && stencilTestOrWrite) ? true : false;
+
+     // one bit per render target that has at least one writable channel;
+     // stays zero when no pixel shader is bound
+     const uint32_t numRTs = psState.numRenderTargets;
+     apiState.colorHottileEnable = 0;
+     if (hasPixelShader)
+     {
+         for (uint32_t rt = 0; rt < numRTs; ++rt)
+         {
+             const auto& rtBlend = apiState.blendState.renderTarget[rt];
+             const bool anyChannelWritable =
+                 !rtBlend.writeDisableAlpha ||
+                 !rtBlend.writeDisableRed ||
+                 !rtBlend.writeDisableGreen ||
+                 !rtBlend.writeDisableBlue;
+
+             if (anyChannelWritable)
+             {
+                 apiState.colorHottileEnable |= (1 << rt);
+             }
+         }
+     }
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief InitDraw
+/// @param pDC - Draw context to initialize for this draw.
+void InitDraw(
+ DRAW_CONTEXT *pDC,
+ bool isSplitDraw)
+{
+ // We don't need to re-setup the scissors/pipeline state again for split draw.
+ if (isSplitDraw == false)
+ {
+ SetupMacroTileScissors(pDC);
+ SetupPipeline(pDC);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief We can split the draw for certain topologies for better performance.
+/// @param totalVerts - Total vertices for draw
+/// @param topology - Topology used for draw
+uint32_t MaxVertsPerDraw(
+ DRAW_CONTEXT* pDC,
+ uint32_t totalVerts,
+ PRIMITIVE_TOPOLOGY topology)
+{
+ API_STATE& state = pDC->pState->state;
+
+ uint32_t vertsPerDraw = totalVerts;
+
+ if (state.soState.soEnable)
+ {
+ return totalVerts;
+ }
+
+ switch (topology)
+ {
+ case TOP_POINT_LIST:
+ case TOP_TRIANGLE_LIST:
+ vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
+ break;
+
+ case TOP_PATCHLIST_1:
+ case TOP_PATCHLIST_2:
+ case TOP_PATCHLIST_3:
+ case TOP_PATCHLIST_4:
+ case TOP_PATCHLIST_5:
+ case TOP_PATCHLIST_6:
+ case TOP_PATCHLIST_7:
+ case TOP_PATCHLIST_8:
+ case TOP_PATCHLIST_9:
+ case TOP_PATCHLIST_10:
+ case TOP_PATCHLIST_11:
+ case TOP_PATCHLIST_12:
+ case TOP_PATCHLIST_13:
+ case TOP_PATCHLIST_14:
+ case TOP_PATCHLIST_15:
+ case TOP_PATCHLIST_16:
+ case TOP_PATCHLIST_17:
+ case TOP_PATCHLIST_18:
+ case TOP_PATCHLIST_19:
+ case TOP_PATCHLIST_20:
+ case TOP_PATCHLIST_21:
+ case TOP_PATCHLIST_22:
+ case TOP_PATCHLIST_23:
+ case TOP_PATCHLIST_24:
+ case TOP_PATCHLIST_25:
+ case TOP_PATCHLIST_26:
+ case TOP_PATCHLIST_27:
+ case TOP_PATCHLIST_28:
+ case TOP_PATCHLIST_29:
+ case TOP_PATCHLIST_30:
+ case TOP_PATCHLIST_31:
+ case TOP_PATCHLIST_32:
+ if (pDC->pState->state.tsState.tsEnable)
+ {
+ uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
+ vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
+ }
+ break;
+
+ default:
+ // We are not splitting up draws for other topologies.
+ break;
+ }
+
+ return vertsPerDraw;
+}
+
+ // Recursive template that turns a list of run-time booleans into the matching
+ // list of compile-time template arguments of ProcessDraw. Each GetFunc level
+ // consumes one boolean and appends it (as true/false) to ArgsB.
+ template <bool... ArgsB>
+ struct FEDrawChooser
+ {
+     // Terminator: the last boolean selects the final ProcessDraw instantiation.
+     static PFN_FE_WORK_FUNC GetFunc(bool bArg)
+     {
+         PFN_FE_WORK_FUNC pfnWhenTrue = ProcessDraw<ArgsB..., true>;
+         PFN_FE_WORK_FUNC pfnWhenFalse = ProcessDraw<ArgsB..., false>;
+
+         return bArg ? pfnWhenTrue : pfnWhenFalse;
+     }
+
+     // Recursive step: fix the first boolean, then let the next chooser level
+     // handle the remaining ones.
+     template <typename... TRestT>
+     static PFN_FE_WORK_FUNC GetFunc(bool bArg, TRestT... restArgs)
+     {
+         return bArg ? FEDrawChooser<ArgsB..., true>::GetFunc(restArgs...)
+                     : FEDrawChooser<ArgsB..., false>::GetFunc(restArgs...);
+     }
+ };
+
+ // Returns the ProcessDraw instantiation matching the given draw properties.
+ // The booleans are forwarded, in this order, as ProcessDraw's template arguments.
+ INLINE
+ static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
+ {
+     using Chooser = FEDrawChooser<>;
+
+     return Chooser::GetFunc(
+         IsIndexed,
+         HasTessellation,
+         HasGeometryShader,
+         HasStreamOut,
+         RasterizerEnabled);
+ }
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief DrawInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
+/// @param startVertex - Specifies start vertex for draw. (vertex data)
+/// @param numInstances - How many instances to render.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void DrawInstanced(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numVertices,
+ uint32_t startVertex,
+ uint32_t numInstances = 1,
+ uint32_t startInstance = 0)
+{
+ if (KNOB_TOSS_DRAW)
+ {
+ return;
+ }
+
+ RDTSC_START(APIDraw);
+
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+ int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
+ uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
+ int32_t remainingVerts = numVertices;
+
+ API_STATE *pState = &pDC->pState->state;
+ pState->topology = topology;
+ pState->forceFront = false;
+
+ // disable culling for points/lines
+ uint32_t oldCullMode = pState->rastState.cullMode;
+ if (topology == TOP_POINT_LIST)
+ {
+ pState->rastState.cullMode = SWR_CULLMODE_NONE;
+ pState->forceFront = true;
+ }
+
+ int draw = 0;
+ while (remainingVerts)
+ {
+ uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
+ remainingVerts : maxVertsPerDraw;
+
+ bool isSplitDraw = (draw > 0) ? true : false;
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
+ InitDraw(pDC, isSplitDraw);
+
+ pDC->FeWork.type = DRAW;
+ pDC->FeWork.pfnWork = GetFEDrawFunc(
+ false, // IsIndexed
+ pState->tsState.tsEnable,
+ pState->gsState.gsEnable,
+ pState->soState.soEnable,
+ pDC->pState->pfnProcessPrims != nullptr);
+ pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
+ pDC->FeWork.desc.draw.startVertex = startVertex;
+ pDC->FeWork.desc.draw.numInstances = numInstances;
+ pDC->FeWork.desc.draw.startInstance = startInstance;
+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
+ pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
+
+ //enqueue DC
+ QueueDraw(pContext);
+
+ remainingVerts -= numVertsForDraw;
+ draw++;
+ }
+
+ // restore culling state
+ pDC = GetDrawContext(pContext);
+ pDC->pState->state.rastState.cullMode = oldCullMode;
+
+ RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDraw
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param startVertex - Specifies start vertex in vertex buffer for draw.
+/// @param primCount - Number of vertices.
+void SwrDraw(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t startVertex,
+ uint32_t numVertices)
+{
+ DrawInstanced(hContext, topology, numVertices, startVertex);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
+/// @param numInstances - How many instances to render.
+/// @param startVertex - Specifies start vertex for draw. (vertex data)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SwrDrawInstanced(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numVertsPerInstance,
+ uint32_t numInstances,
+ uint32_t startVertex,
+ uint32_t startInstance
+ )
+{
+ DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief DrawIndexedInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+/// @param numInstances - Number of instances to render.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void DrawIndexedInstance(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numIndices,
+ uint32_t indexOffset,
+ int32_t baseVertex,
+ uint32_t numInstances = 1,
+ uint32_t startInstance = 0)
+{
+ if (KNOB_TOSS_DRAW)
+ {
+ return;
+ }
+
+ RDTSC_START(APIDrawIndexed);
+
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ API_STATE* pState = &pDC->pState->state;
+
+ int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
+ uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
+ int32_t remainingIndices = numIndices;
+
+ uint32_t indexSize = 0;
+ switch (pState->indexBuffer.format)
+ {
+ case R32_UINT: indexSize = sizeof(uint32_t); break;
+ case R16_UINT: indexSize = sizeof(uint16_t); break;
+ case R8_UINT: indexSize = sizeof(uint8_t); break;
+ default:
+ SWR_ASSERT(0);
+ }
+
+ int draw = 0;
+ uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
+ pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
+
+ pState->topology = topology;
+ pState->forceFront = false;
+
+ // disable culling for points/lines
+ uint32_t oldCullMode = pState->rastState.cullMode;
+ if (topology == TOP_POINT_LIST)
+ {
+ pState->rastState.cullMode = SWR_CULLMODE_NONE;
+ pState->forceFront = true;
+ }
+
+ while (remainingIndices)
+ {
+ uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
+ remainingIndices : maxIndicesPerDraw;
+
+ // When breaking up draw, we need to obtain new draw context for each iteration.
+ bool isSplitDraw = (draw > 0) ? true : false;
+ pDC = GetDrawContext(pContext, isSplitDraw);
+ InitDraw(pDC, isSplitDraw);
+
+ pDC->FeWork.type = DRAW;
+ pDC->FeWork.pfnWork = GetFEDrawFunc(
+ true, // IsIndexed
+ pState->tsState.tsEnable,
+ pState->gsState.gsEnable,
+ pState->soState.soEnable,
+ pDC->pState->pfnProcessPrims != nullptr);
+ pDC->FeWork.desc.draw.pDC = pDC;
+ pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
+ pDC->FeWork.desc.draw.pIB = (int*)pIB;
+ pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
+
+ pDC->FeWork.desc.draw.numInstances = numInstances;
+ pDC->FeWork.desc.draw.startInstance = startInstance;
+ pDC->FeWork.desc.draw.baseVertex = baseVertex;
+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
+
+ //enqueue DC
+ QueueDraw(pContext);
+
+ pIB += maxIndicesPerDraw * indexSize;
+ remainingIndices -= numIndicesForDraw;
+ draw++;
+ }
+
+ // restore culling state
+ pDC = GetDrawContext(pContext);
+ pDC->pState->state.rastState.cullMode = oldCullMode;
+
+ RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief DrawIndexed
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+void SwrDrawIndexed(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numIndices,
+ uint32_t indexOffset,
+ int32_t baseVertex
+ )
+{
+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexedInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param numInstances - Number of instances to render.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SwrDrawIndexedInstanced(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numIndices,
+ uint32_t numInstances,
+ uint32_t indexOffset,
+ int32_t baseVertex,
+ uint32_t startInstance)
+{
+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
+}
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Queues an INVALIDATETILES work item (handled by
+ ///        ProcessInvalidateTiles) for the given attachments.
+ /// @param hContext - Handle passed back from SwrCreateContext
+ /// @param attachmentMask - Mask of the attachments to invalidate; stored
+ ///        unchanged in the work descriptor.
+ void SwrInvalidateTiles(
+     HANDLE hContext,
+     uint32_t attachmentMask)
+ {
+     SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+     // describe the invalidate request as front-end work on this draw context
+     auto& feWork = pDC->FeWork;
+     feWork.type = INVALIDATETILES;
+     feWork.pfnWork = ProcessInvalidateTiles;
+     feWork.desc.invalidateTiles.attachmentMask = attachmentMask;
+
+     // enqueue
+     QueueDraw(pContext);
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDispatch
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param threadGroupCountX - Number of thread groups dispatched in X direction
+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
+void SwrDispatch(
+ HANDLE hContext,
+ uint32_t threadGroupCountX,
+ uint32_t threadGroupCountY,
+ uint32_t threadGroupCountZ)
+{
+ if (KNOB_TOSS_DRAW)
+ {
+ return;
+ }
+
+ RDTSC_START(APIDispatch);
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+ pDC->isCompute = true; // This is a compute context.
+
+ // Ensure spill fill pointers are initialized to nullptr.
+ memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
+
+ COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
+
+ pTaskData->threadGroupCountX = threadGroupCountX;
+ pTaskData->threadGroupCountY = threadGroupCountY;
+ pTaskData->threadGroupCountZ = threadGroupCountZ;
+
+ uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+ pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
+
+ QueueDispatch(pContext);
+ RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
+}
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Queues a STORETILES work item (handled by ProcessStoreTiles):
+ ///        deswizzles, converts and stores the current contents of the hot
+ ///        tiles to the attachment's surface.
+ /// @param hContext - Handle passed back from SwrCreateContext
+ /// @param attachment - Render target attachment to store.
+ /// @param postStoreTileState - Tile state recorded in the work descriptor
+ ///        for use after the store.
+ void SwrStoreTiles(
+     HANDLE hContext,
+     SWR_RENDERTARGET_ATTACHMENT attachment,
+     SWR_TILE_STATE postStoreTileState)
+ {
+     RDTSC_START(APIStoreTiles);
+
+     SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+     SetupMacroTileScissors(pDC);
+
+     auto& feWork = pDC->FeWork;
+     feWork.type = STORETILES;
+     feWork.pfnWork = ProcessStoreTiles;
+     feWork.desc.storeTiles.attachment = attachment;
+     feWork.desc.storeTiles.postStoreTileState = postStoreTileState;
+
+     // enqueue
+     QueueDraw(pContext);
+
+     RDTSC_STOP(APIStoreTiles, 0, 0);
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Queues a CLEAR work item (handled by ProcessClear).
+ /// @param hContext - Handle passed back from SwrCreateContext
+ /// @param clearMask - Stored as the mask of the clear flags.
+ /// @param clearColor - Four float color values copied into the work descriptor.
+ /// @param z - Depth clear value.
+ /// @param stencil - Stencil clear value.
+ void SwrClearRenderTarget(
+     HANDLE hContext,
+     uint32_t clearMask,
+     const float clearColor[4],
+     float z,
+     BYTE stencil)
+ {
+     RDTSC_START(APIClearRenderTarget);
+
+     SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+
+     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+     SetupMacroTileScissors(pDC);
+
+     CLEAR_FLAGS flags;
+     flags.mask = clearMask;
+
+     pDC->FeWork.type = CLEAR;
+     pDC->FeWork.pfnWork = ProcessClear;
+
+     auto& clearDesc = pDC->FeWork.desc.clear;
+     clearDesc.flags = flags;
+     clearDesc.clearDepth = z;
+     for (uint32_t channel = 0; channel < 4; ++channel)
+     {
+         clearDesc.clearRTColor[channel] = clearColor[channel];
+     }
+     clearDesc.clearStencil = stencil;
+
+     // enqueue draw
+     QueueDraw(pContext);
+
+     RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external componets such as the
+/// sampler.
+/// SWR is responsible for the allocation of the private context state.
+/// @param hContext - Handle passed back from SwrCreateContext
+VOID* SwrGetPrivateContextState(
+ HANDLE hContext)
+{
+ SWR_CONTEXT* pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ DRAW_STATE* pState = pDC->pState;
+
+ if (pState->pPrivateState == nullptr)
+ {
+ pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
+ }
+
+ return pState->pPrivateState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once operation
+/// has completed. Client can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SwrAllocDrawContextMemory(
+ HANDLE hContext,
+ uint32_t size,
+ uint32_t align)
+{
+ SWR_CONTEXT* pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+ return pDC->pState->pArena->AllocAligned(size, align);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns pointer to SWR stats.
+/// @note The counters are atomically incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @todo If necessary, add a callback to avoid stalling the pipe to
+/// sample the counters.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SwrGetStats(
+ HANDLE hContext,
+ SWR_STATS* pStats)
+{
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+ pDC->FeWork.type = QUERYSTATS;
+ pDC->FeWork.pfnWork = ProcessQueryStats;
+ pDC->FeWork.desc.queryStats.pStats = pStats;
+
+ // cannot execute until all previous draws have completed
+ pDC->dependency = pDC->drawId - 1;
+
+ //enqueue
+ QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SwrEnableStats(
+ HANDLE hContext,
+ bool enable)
+{
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+ pDC->pState->state.enableStats = enable;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Mark end of frame - used for performance profiling
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrEndFrame(
+ HANDLE hContext)
+{
+ RDTSC_ENDFRAME();
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
new file mode 100644
index 00000000000..72fae8b2c21
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -0,0 +1,500 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file api.h
+*
+* @brief API definitions
+*
+******************************************************************************/
+
+#ifndef __SWR_API_H__
+#define __SWR_API_H__
+
+#include "common/os.h"
+
+#include <assert.h>
+#include <vector>
+
+#include "common/simdintrin.h"
+#include "common/formats.h"
+#include "core/utils.h"
+#include "core/state.h"
+
+///@todo place all the API functions into the 'swr' namespace.
+
+typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Function signature for load hot tiles
+/// @param hPrivateContext - handle to private data
+/// @param dstFormat - format of the hot tile
+/// @param renderTargetIndex - render target to load, can be color, depth or stencil
+/// @param x - destination x coordinate
+/// @param y - destination y coordinate
+/// @param renderTargetArrayIndex - render target array index
+///        (NOTE(review): presumably selects the array slice of the attachment - confirm)
+/// @param pDstHotTile - pointer to the hot tile surface
+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Function signature for store hot tiles
+/// @param hPrivateContext - handle to private data
+/// @param srcFormat - format of the hot tile
+/// @param renderTargetIndex - render target to store, can be color, depth or stencil
+/// @param x - destination x coordinate
+/// @param y - destination y coordinate
+/// @param renderTargetArrayIndex - render target array index
+///        (NOTE(review): presumably selects the array slice of the attachment - confirm)
+/// @param pSrcHotTile - pointer to the hot tile surface
+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Function signature for clearing from the hot tiles clear value
+/// @param hPrivateContext - handle to private data
+/// @param rtIndex - render target to clear, can be color, depth or stencil
+/// @param x - destination x coordinate
+/// @param y - destination y coordinate
+/// @param pClearColor - pointer to the hot tile's clear value
+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
+ SWR_RENDERTARGET_ATTACHMENT rtIndex,
+ uint32_t x, uint32_t y, const float* pClearColor);
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_CREATECONTEXT_INFO
+/// @brief Creation parameters handed to SwrCreateContext.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_CREATECONTEXT_INFO
+{
+ // Driver type requesting the context (see DRIVER_TYPE).
+ DRIVER_TYPE driver;
+
+ // External functions (e.g. sampler) need per draw context state.
+ // Use SwrGetPrivateContextState() to access private state.
+ uint32_t privateStateSize;
+
+ // Each SWR context can have multiple sets of active state
+ // (selected with SwrSetActiveSubContext()).
+ uint32_t maxSubContexts;
+
+ // tile manipulation functions: load, store and clear callbacks
+ // matching the PFN_*_TILE signatures.
+ PFN_LOAD_TILE pfnLoadTile;
+ PFN_STORE_TILE pfnStoreTile;
+ PFN_CLEAR_TILE pfnClearTile;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_RECT
+/// @brief Rectangle described by four unsigned edge coordinates.
+/// NOTE(review): whether right/bottom are inclusive or exclusive is not
+/// visible here - confirm against the users of this struct.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_RECT
+{
+ uint32_t left;
+ uint32_t right;
+ uint32_t top;
+ uint32_t bottom;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create SWR Context.
+/// @param pCreateInfo - pointer to creation info.
+HANDLE SWR_API SwrCreateContext(
+ const SWR_CREATECONTEXT_INFO* pCreateInfo);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroys SWR Context.
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrDestroyContext(
+ HANDLE hContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set currently active state context
+/// @param subContextIndex - value from 0 to
+/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
+void SWR_API SwrSetActiveSubContext(
+ HANDLE hContext,
+ uint32_t subContextIndex);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
+/// has been completed
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnFunc - pointer to callback function (see PFN_CALLBACK_FUNC)
+/// @param userData - user data to pass back
+/// @param userData2 - second user data value
+///        (NOTE(review): presumably passed back as the callback's data2 - confirm)
+/// @param userData3 - third user data value, defaults to 0
+///        (NOTE(review): presumably passed back as the callback's data3 - confirm)
+void SWR_API SwrSync(
+ HANDLE hContext,
+ PFN_CALLBACK_FUNC pfnFunc,
+ uint64_t userData,
+ uint64_t userData2,
+ uint64_t userData3 = 0);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Blocks until all rendering has been completed.
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrWaitForIdle(
+ HANDLE hContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set vertex buffer state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numBuffers - Number of vertex buffer state descriptors.
+/// @param pVertexBuffers - Array of vertex buffer state descriptors.
+void SWR_API SwrSetVertexBuffers(
+ HANDLE hContext,
+ uint32_t numBuffers,
+ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set index buffer
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pIndexBuffer - Index buffer.
+void SWR_API SwrSetIndexBuffer(
+ HANDLE hContext,
+ const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set fetch shader pointer.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnFetchFunc - Pointer to shader.
+void SWR_API SwrSetFetchFunc(
+ HANDLE hContext,
+ PFN_FETCH_FUNC pfnFetchFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set streamout shader pointer.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnSoFunc - Pointer to shader.
+/// @param streamIndex - specifies stream
+void SWR_API SwrSetSoFunc(
+ HANDLE hContext,
+ PFN_SO_FUNC pfnSoFunc,
+ uint32_t streamIndex);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set streamout state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pSoState - Pointer to streamout state.
+void SWR_API SwrSetSoState(
+ HANDLE hContext,
+ SWR_STREAMOUT_STATE* pSoState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set streamout buffer state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pSoBuffer - Pointer to streamout buffer.
+/// @param slot - Slot to bind SO buffer to.
+void SWR_API SwrSetSoBuffers(
+ HANDLE hContext,
+ SWR_STREAMOUT_BUFFER* pSoBuffer,
+ uint32_t slot);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set vertex shader pointer.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnVertexFunc - Pointer to shader.
+void SWR_API SwrSetVertexFunc(
+ HANDLE hContext,
+ PFN_VERTEX_FUNC pfnVertexFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set frontend state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state
+void SWR_API SwrSetFrontendState(
+ HANDLE hContext,
+ SWR_FRONTEND_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set geometry shader state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state
+void SWR_API SwrSetGsState(
+ HANDLE hContext,
+ SWR_GS_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set geometry shader
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnGsFunc - Pointer to geometry shader function
+void SWR_API SwrSetGsFunc(
+ HANDLE hContext,
+ PFN_GS_FUNC pfnGsFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set compute shader
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnCsFunc - Pointer to compute shader function
+/// @param totalThreadsInGroup - product of thread group dimensions.
+void SWR_API SwrSetCsFunc(
+ HANDLE hContext,
+ PFN_CS_FUNC pfnCsFunc,
+ uint32_t totalThreadsInGroup);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set tessellation state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state
+void SWR_API SwrSetTsState(
+ HANDLE hContext,
+ SWR_TS_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set hull shader
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnFunc - Pointer to shader function
+void SWR_API SwrSetHsFunc(
+ HANDLE hContext,
+ PFN_HS_FUNC pfnFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set domain shader
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pfnFunc - Pointer to shader function
+void SWR_API SwrSetDsFunc(
+ HANDLE hContext,
+ PFN_DS_FUNC pfnFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set depth stencil state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state.
+void SWR_API SwrSetDepthStencilState(
+ HANDLE hContext,
+ SWR_DEPTH_STENCIL_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set backend state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state.
+void SWR_API SwrSetBackendState(
+ HANDLE hContext,
+ SWR_BACKEND_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set pixel shader state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state.
+void SWR_API SwrSetPixelShaderState(
+ HANDLE hContext,
+ SWR_PS_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set blend state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pState - Pointer to state.
+void SWR_API SwrSetBlendState(
+ HANDLE hContext,
+ SWR_BLEND_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set blend function
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param renderTarget - render target index
+/// @param pfnBlendFunc - function pointer
+void SWR_API SwrSetBlendFunc(
+ HANDLE hContext,
+ uint32_t renderTarget,
+ PFN_BLEND_JIT_FUNC pfnBlendFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set linkage mask
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param mask - Specifies which vertex outputs are needed by PS.
+/// @param pMap - (Optional)Linkage map to specify where FE attributes are
+/// gathered from to supply PS attribute values. The length
+/// of the map buffer needs to match the number of set bits
+/// in "mask".
+void SWR_API SwrSetLinkage(
+ HANDLE hContext,
+ uint32_t mask,
+ const uint8_t* pMap);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDraw
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param startVertex - Specifies start vertex in vertex buffer for draw.
+/// @param primCount - Number of vertices.
+void SWR_API SwrDraw(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t startVertex,
+ uint32_t primCount);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
+/// @param numInstances - How many instances to render.
+/// @param startVertex - Specifies start vertex for draw. (vertex data)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawInstanced(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numVertsPerInstance,
+ uint32_t numInstances,
+ uint32_t startVertex,
+ uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief DrawIndexed
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+void SWR_API SwrDrawIndexed(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numIndices,
+ uint32_t indexOffset,
+ int32_t baseVertex);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexedInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param numInstances - Number of instances to render.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawIndexedInstanced(
+ HANDLE hContext,
+ PRIMITIVE_TOPOLOGY topology,
+ uint32_t numIndices,
+ uint32_t numInstances,
+ uint32_t indexOffset,
+ int32_t baseVertex,
+ uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
+void SWR_API SwrInvalidateTiles(
+ HANDLE hContext,
+ uint32_t attachmentMask);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDispatch
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param threadGroupCountX - Number of thread groups dispatched in X direction
+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
+void SWR_API SwrDispatch(
+ HANDLE hContext,
+ uint32_t threadGroupCountX,
+ uint32_t threadGroupCountY,
+ uint32_t threadGroupCountZ);
+
+
+/// State of a hot tile relative to the surface it represents.
+/// Note that the value 1 is not assigned to any enumerator here.
+enum SWR_TILE_STATE
+{
+ SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents before rendering
+ SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents
+ SWR_TILE_RESOLVED = 3, // is in sync with surface it represents
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrStoreTiles
+/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachment - Render target attachment to store
+///        (NOTE(review): inferred from the parameter name - confirm)
+/// @param postStoreTileState - Tile state to apply once the store is done
+///        (NOTE(review): inferred from the parameter name - confirm)
+void SWR_API SwrStoreTiles(
+ HANDLE hContext,
+ SWR_RENDERTARGET_ATTACHMENT attachment,
+ SWR_TILE_STATE postStoreTileState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrClearRenderTarget
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param clearMask - Mask selecting what to clear
+///        (NOTE(review): presumably a bitmask of attachments - confirm)
+/// @param clearColor - Four-component clear color.
+/// @param z - Depth clear value.
+/// @param stencil - Stencil clear value.
+void SWR_API SwrClearRenderTarget(
+ HANDLE hContext,
+ uint32_t clearMask,
+ const FLOAT clearColor[4],
+ float z,
+ BYTE stencil);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set rasterizer state
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pRastState - Pointer to state.
+void SWR_API SwrSetRastState(
+ HANDLE hContext,
+ const SWR_RASTSTATE *pRastState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetViewports
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numViewports - number of viewports passed in
+/// @param pViewports - Specifies extents of viewport.
+/// @param pMatrices - If not specified then SWR computes a default one.
+void SWR_API SwrSetViewports(
+ HANDLE hContext,
+ uint32_t numViewports,
+ const SWR_VIEWPORT* pViewports,
+ const SWR_VIEWPORT_MATRIX* pMatrices);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetScissorRects
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numScissors - number of scissors passed in
+/// @param pScissors - array of scissors
+void SWR_API SwrSetScissorRects(
+ HANDLE hContext,
+ uint32_t numScissors,
+ const BBOX* pScissors);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external components such as the
+/// sampler.
+///
+/// @note Client needs to resend private state prior to each draw call.
+/// Also, SWR is responsible for the private state memory.
+/// @param hContext - Handle passed back from SwrCreateContext
+VOID* SWR_API SwrGetPrivateContextState(
+ HANDLE hContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once operation
+/// has completed. Client can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SWR_API SwrAllocDrawContextMemory(
+ HANDLE hContext,
+ uint32_t size,
+ uint32_t align);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fills in the caller-provided SWR stats structure.
+/// @note The counters are incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SWR_API SwrGetStats(
+ HANDLE hContext,
+ SWR_STATS* pStats);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SWR_API SwrEnableStats(
+ HANDLE hContext,
+ bool enable);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Mark end of frame - used for performance profiling
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrEndFrame(
+ HANDLE hContext);
+#endif//__SWR_API_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
new file mode 100644
index 00000000000..8184c8d3f4c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
@@ -0,0 +1,166 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.cpp
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations for any of
+* our allocations that are associated with operations and can all be freed
+* once when their operation has completed. Allocations are cheap since
+* most of the time it's simply an increment of an offset. Also, no need to
+* free individual allocations. All of the arena memory can be freed at once.
+*
+******************************************************************************/
+
+#include "context.h"
+#include "arena.h"
+
+#include <cmath>
+
+/// Creates an empty arena: no block allocated yet, zero bytes handed out,
+/// and a freshly allocated mutex for the *Sync allocation functions.
+Arena::Arena()
+    : m_pCurBlock(nullptr)
+    , m_size(0)
+    , m_pMutex(new std::mutex())
+{
+}
+
+/// Releases every block owned by the arena, then the mutex.
+Arena::~Arena()
+{
+    // Reset(true) frees the older blocks first and then the current block
+    // that a plain Reset() would keep around for reuse.
+    Reset(true);
+
+    delete m_pMutex;
+}
+
+///@todo Remove this when all users have stopped using this.
+/// Puts the arena into its empty state without looking at the previous
+/// member values: the block pointer and size are cleared and a new mutex
+/// is allocated. Any block or mutex the members referred to beforehand is
+/// NOT released, so calling this on an already constructed Arena leaks the
+/// mutex created by the constructor.
+void Arena::Init()
+{
+    m_pCurBlock = nullptr;
+    m_size = 0;
+
+    m_pMutex = new std::mutex;
+}
+
+/// Allocates `size` bytes from the arena at an offset aligned to `align`.
+///
+/// The request is served from the current block when it fits. Otherwise a
+/// new block is allocated, made current (the old one stays linked through
+/// pNext until Reset), and the request is retried against it.
+///
+/// @param size  - number of bytes requested.
+/// @param align - alignment applied to the offset inside the block. Block
+///                storage itself is requested with KNOB_SIMD_WIDTH*4 byte
+///                alignment, so larger alignments are not guaranteed for
+///                the returned address.
+/// @return pointer to the allocation, or nullptr if the backing memory or
+///         the block header could not be allocated.
+/// @note Not synchronized; use AllocAlignedSync for the locked variant.
+void* Arena::AllocAligned(size_t size, size_t align)
+{
+    if (m_pCurBlock)
+    {
+        ArenaBlock* pCurBlock = m_pCurBlock;
+        pCurBlock->offset = AlignUp(pCurBlock->offset, align);
+
+        if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
+        {
+            void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
+            pCurBlock->offset += size;
+            m_size += size;
+            return pMem;
+        }
+
+        // Not enough memory in this block, fall through to allocate
+        // a new block
+    }
+
+    // New blocks are at least ArenaBlockSize bytes, at least as large as the
+    // request, and at least ArenaBlockSize larger than what is already in use.
+    static const size_t ArenaBlockSize = 1024*1024;
+    size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+    blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
+
+    void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4);     // Arena blocks are always simd byte aligned.
+    SWR_ASSERT(pMem != nullptr);
+    if (pMem == nullptr)
+    {
+        // Do not install a block without storage; report the failure.
+        return nullptr;
+    }
+
+    ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
+    SWR_ASSERT(pNewBlock != nullptr);
+    if (pNewBlock == nullptr)
+    {
+        // Without a header the storage can never be tracked or freed, and
+        // retrying would recurse forever: release it and report the failure.
+        _aligned_free(pMem);
+        return nullptr;
+    }
+
+    pNewBlock->pNext = m_pCurBlock;
+    pNewBlock->pMem = pMem;
+    pNewBlock->blockSize = blockSize;
+    m_pCurBlock = pNewBlock;
+
+    // The fresh block starts at offset 0 and is at least `size` bytes large,
+    // so this retry is served from it.
+    return AllocAligned(size, align);
+}
+
+/// Allocates `size` bytes from the arena by forwarding to AllocAligned with
+/// an alignment of 1. Not synchronized; AllocSync is the locked variant.
+void* Arena::Alloc(size_t size)
+{
+ return AllocAligned(size, 1);
+}
+
+/// Same as AllocAligned, but performed while holding the arena mutex so the
+/// *Sync allocation functions can be called from several threads.
+/// @param size  - number of bytes requested.
+/// @param align - alignment applied to the offset inside the block.
+/// @return whatever AllocAligned returns for the request.
+void* Arena::AllocAlignedSync(size_t size, size_t align)
+{
+    SWR_ASSERT(m_pMutex != nullptr);
+
+    // Scoped guard: the mutex is released on every path out of the function.
+    std::lock_guard<std::mutex> lock(*m_pMutex);
+    return AllocAligned(size, align);
+}
+
+/// Same as Alloc, but performed while holding the arena mutex so the
+/// *Sync allocation functions can be called from several threads.
+/// @param size - number of bytes requested.
+/// @return whatever Alloc returns for the request.
+void* Arena::AllocSync(size_t size)
+{
+    SWR_ASSERT(m_pMutex != nullptr);
+
+    // Scoped guard: the mutex is released on every path out of the function.
+    std::lock_guard<std::mutex> lock(*m_pMutex);
+    return Alloc(size);
+}
+
+/// Makes all arena memory available again.
+///
+/// The current block is rewound to offset 0 and every older block linked
+/// behind it is freed. With removeAll set, the current block is freed too
+/// and the arena is left without any block. The running size is cleared in
+/// every case.
+/// @param removeAll - also release the current block.
+void Arena::Reset(bool removeAll)
+{
+    if (ArenaBlock* const pKeep = m_pCurBlock)
+    {
+        pKeep->offset = 0;
+
+        // Detach the chain of older blocks and release it node by node.
+        ArenaBlock* pOld = pKeep->pNext;
+        pKeep->pNext = nullptr;
+        while (pOld != nullptr)
+        {
+            ArenaBlock* const pFollowing = pOld->pNext;
+
+            _aligned_free(pOld->pMem);
+            delete pOld;
+
+            pOld = pFollowing;
+        }
+
+        if (removeAll)
+        {
+            _aligned_free(pKeep->pMem);
+            delete pKeep;
+            m_pCurBlock = nullptr;
+        }
+    }
+
+    m_size = 0;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
new file mode 100644
index 00000000000..76eee11fb08
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -0,0 +1,69 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.h
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations for any of
+* our allocations that are associated with operations and can all be freed
+* once when their operation has completed. Allocations are cheap since
+* most of the time it's simply an increment of an offset. Also, no need to
+* free individual allocations. All of the arena memory can be freed at once.
+*
+******************************************************************************/
+#pragma once
+
+#include <mutex>
+
+/// Block-based arena allocator. Allocations are carved out of the current
+/// block by advancing an offset; blocks are chained and released together
+/// by Reset() or the destructor.
+///
+/// @note The class owns raw memory and a heap-allocated mutex but declares
+///       no copy operations; do not copy Arena objects.
+class Arena
+{
+public:
+    Arena();
+    ~Arena();
+
+    /// Re-establishes the empty state without releasing anything.
+    void Init();
+
+    /// Unsynchronized allocation of `size` bytes at an `align`-aligned offset.
+    void* AllocAligned(size_t size, size_t align);
+    /// Unsynchronized allocation of `size` bytes with alignment 1.
+    void* Alloc(size_t size);
+
+    /// Mutex-protected counterparts of AllocAligned / Alloc.
+    void* AllocAlignedSync(size_t size, size_t align);
+    void* AllocSync(size_t size);
+
+    /// Rewinds the current block and frees the older ones; with removeAll
+    /// the current block is freed as well.
+    void Reset(bool removeAll = false);
+
+    /// Sum of the sizes requested since the last Reset (alignment padding
+    /// is not included).
+    size_t Size() const { return m_size; }
+
+private:
+
+    /// Header of one chunk of arena storage.
+    struct ArenaBlock
+    {
+        void* pMem = nullptr;           // start of the block's storage
+        size_t blockSize = 0;           // size of the storage in bytes
+        size_t offset = 0;              // next free byte within the storage
+        ArenaBlock* pNext = nullptr;    // previously used (older) block
+    };
+
+    ArenaBlock* m_pCurBlock = nullptr;  // block allocations are served from
+    size_t m_size = 0;                  // bytes handed out since last Reset
+
+    /// @note Mutex is only used by sync allocation functions.
+    std::mutex* m_pMutex = nullptr;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
new file mode 100644
index 00000000000..4a472bc9e5c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -0,0 +1,1899 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "rdtsc_core.h"
+#include "backend.h"
+#include "depthstencil.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+// Four (x, y) offset pairs, one per SSE lane:
+// (0.5, 0.5), (KNOB_TILE_X_DIM - 0.5, 0.5), (0.5, KNOB_TILE_Y_DIM - 0.5),
+// (KNOB_TILE_X_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5).
+// NOTE(review): presumably the pixel centers of the four corner pixels of a
+// raster tile; the consumers are not visible in this part of the file.
+const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
+const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
+
+/// @todo move to common lib
+// MASKTOVEC takes four 0/1 flags (i3 is the most significant) and expands to
+// a 4-lane initializer in lane order {i0, i1, i2, i3}, each flag negated.
+// gMaskToVec is the resulting 16-entry table indexed by a 4-bit mask: lane k
+// of entry m is -(bit k of m).
+// NOTE(review): the lanes are initialized from the integers 0 / -1, so a set
+// lane presumably holds the float -1.0f (sign bit set) rather than an all-ones
+// bit pattern - confirm consumers only depend on the sign bit.
+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
+static const __m128 gMaskToVec[] = {
+ MASKTOVEC(0,0,0,0),
+ MASKTOVEC(0,0,0,1),
+ MASKTOVEC(0,0,1,0),
+ MASKTOVEC(0,0,1,1),
+ MASKTOVEC(0,1,0,0),
+ MASKTOVEC(0,1,0,1),
+ MASKTOVEC(0,1,1,0),
+ MASKTOVEC(0,1,1,1),
+ MASKTOVEC(1,0,0,0),
+ MASKTOVEC(1,0,0,1),
+ MASKTOVEC(1,0,1,0),
+ MASKTOVEC(1,0,1,1),
+ MASKTOVEC(1,1,0,0),
+ MASKTOVEC(1,1,0,1),
+ MASKTOVEC(1,1,1,0),
+ MASKTOVEC(1,1,1,1),
+};
+
+// Signature of a per-format macro tile clear routine:
+// (draw context, render target attachment, macro tile id, 4 DWORDs of clear data).
+// sClearTilesTable holds one such routine per SWR_FORMAT and is indexed by the
+// hot tile format (see ProcessClearBE / ProcessStoreTileBE).  Where the table is
+// populated is not visible in this part of the file.
+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
+static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend entry point for compute: runs the compute shader for a
+///        single thread group of a dispatch.
+/// @param pDC - pointer to draw context (dispatch).
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param threadGroupId - the linear index for the thread group within the dispatch.
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+{
+    RDTSC_START(BEDispatch);
+
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
+    SWR_ASSERT(pTaskData != nullptr);
+
+    // Lazily allocate this worker's spill/fill buffer on first use.
+    auto& pWorkerSpillFill = pDC->pSpillFill[workerId];
+    if (pWorkerSpillFill == nullptr)
+    {
+        ///@todo Add state which indicates the spill fill size.
+        pWorkerSpillFill = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+    }
+
+    const API_STATE& apiState = GetApiState(pDC);
+
+    // Describe this thread group to the compute shader.
+    SWR_CS_CONTEXT csCtx{ 0 };
+    csCtx.tileCounter      = threadGroupId;
+    csCtx.dispatchDims[0]  = pTaskData->threadGroupCountX;
+    csCtx.dispatchDims[1]  = pTaskData->threadGroupCountY;
+    csCtx.dispatchDims[2]  = pTaskData->threadGroupCountZ;
+    csCtx.pTGSM            = pContext->pScratch[workerId];
+    csCtx.pSpillFillBuffer = pWorkerSpillFill;
+
+    apiState.pfnCsFunc(GetPrivateState(pDC), &csCtx);
+
+    UPDATE_STAT(CsInvocations, apiState.totalThreadsInGroup);
+
+    RDTSC_STOP(BEDispatch, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend sync work item: invokes the client's sync callback, if any.
+/// @param pDC - pointer to draw context (unused here).
+/// @param workerId - worker thread id (unused here).
+/// @param macroTile - macro tile id; asserted to decode to tile (0, 0).
+/// @param pUserData - pointer to the SYNC_DESC for this sync.
+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroTile, x, y);
+    SWR_ASSERT(x == 0 && y == 0);
+
+    SYNC_DESC *pDesc = (SYNC_DESC*)pUserData;
+    if (pDesc->pfnCallbackFunc == nullptr)
+    {
+        // no callback registered; nothing to do
+        return;
+    }
+
+    pDesc->pfnCallbackFunc(pDesc->userData, pDesc->userData2, pDesc->userData3);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend query-stats work item: accumulates the per-worker
+///        statistics of the context into the client's SWR_STATS.
+/// @param pDC - pointer to draw context.
+/// @param workerId - worker thread id (unused here).
+/// @param macroTile - macro tile id (unused here).
+/// @param pUserData - pointer to the QUERY_DESC holding the destination stats.
+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+    QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
+    SWR_STATS* pStats = pQueryDesc->pStats;
+
+    SWR_ASSERT(pStats != nullptr);
+
+    for (uint32_t worker = 0; worker < pContext->NumWorkerThreads; ++worker)
+    {
+        auto& workerStats = pContext->stats[worker];
+
+        pStats->DepthPassCount += workerStats.DepthPassCount;
+
+        pStats->IaVertices     += workerStats.IaVertices;
+        pStats->IaPrimitives   += workerStats.IaPrimitives;
+        pStats->VsInvocations  += workerStats.VsInvocations;
+        pStats->HsInvocations  += workerStats.HsInvocations;
+        pStats->DsInvocations  += workerStats.DsInvocations;
+        pStats->GsInvocations  += workerStats.GsInvocations;
+        pStats->PsInvocations  += workerStats.PsInvocations;
+        pStats->CInvocations   += workerStats.CInvocations;
+        pStats->CsInvocations  += workerStats.CsInvocations;
+        pStats->CPrimitives    += workerStats.CPrimitives;
+        pStats->GsPrimitives   += workerStats.GsPrimitives;
+
+        for (uint32_t so = 0; so < MAX_SO_STREAMS; ++so)
+        {
+            pStats->SoWriteOffset[so] += workerStats.SoWriteOffset[so];
+
+            /// @note client is required to provide valid write offset before every draw, so we clear
+            /// out the contents of the write offset when storing stats
+            workerStats.SoWriteOffset[so] = 0;
+
+            pStats->SoPrimStorageNeeded[so] += workerStats.SoPrimStorageNeeded[so];
+            pStats->SoNumPrimsWritten[so]   += workerStats.SoNumPrimsWritten[so];
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fills one raster tile with a clear value, one SIMD tile at a time.
+/// @param pTileBuffer - start of the raster tile's storage.
+/// @param value - per-component SIMD clear value.
+template<SWR_FORMAT format>
+void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+{
+    // Stores one component for a full SIMD tile and advances the write
+    // pointer past the bytes that component occupies.
+    auto storeComponent = [&](int comp)
+    {
+        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
+        const auto componentBytes = (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
+        pTileBuffer += componentBytes;
+    };
+
+    // number of SIMD tiles that make up one raster tile
+    const uint32_t simdTilesPerRasterTile = (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) * (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM);
+
+    uint32_t remaining = simdTilesPerRasterTile;
+    while (remaining != 0)
+    {
+        // components are written in order 0 .. numComps-1 for each SIMD tile
+        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(storeComponent);
+        --remaining;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clears the scissored portion of a macro tile's hot tile and marks
+///        the hot tile dirty.
+/// @param pDC - pointer to draw context.
+/// @param rt - render target attachment whose hot tile is cleared.
+/// @param macroTile - macro tile id.
+/// @param clear - clear value, one DWORD per component (RGBA float/uint32).
+template<SWR_FORMAT format>
+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
+{
+    // convert the clear color (RGBA float/uint32) to hottile format
+    simdvector vClear;
+    for (uint32_t c = 0; c < FormatTraits<format>::numComps; ++c)
+    {
+        simdscalar vValue = _simd_load1_ps((const float*)&clear[c]);
+        if (FormatTraits<format>::isNormalized(c))
+        {
+            // scale normalized components to the format's integer range
+            vValue = _simd_mul_ps(vValue, _simd_set1_ps(FormatTraits<format>::fromFloat(c)));
+            vValue = _simd_castsi_ps(_simd_cvtps_epi32(vValue));
+        }
+        vValue = FormatTraits<format>::pack(c, vValue);
+        vClear.v[FormatTraits<format>::swizzle(c)] = vValue;
+    }
+
+    uint32_t tileX, tileY;
+    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
+    const API_STATE& state = GetApiState(pDC);
+
+    // fixed point origin of this macro tile
+    const int originX = KNOB_MACROTILE_X_DIM_FIXED * tileX;
+    const int originY = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
+
+    // macro tile bounds (inclusive) intersected with scissor
+    int top    = std::max<int>(originY, state.scissorInFixedPoint.top);
+    int left   = std::max<int>(originX, state.scissorInFixedPoint.left);
+    int bottom = std::min<int>(originY + KNOB_MACROTILE_Y_DIM_FIXED - 1, state.scissorInFixedPoint.bottom);
+    int right  = std::min<int>(originX + KNOB_MACROTILE_X_DIM_FIXED - 1, state.scissorInFixedPoint.right);
+
+    // translate to local hottile origin, then convert to raster tiles
+    top    = (top    - originY) >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    bottom = (bottom - originY) >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    left   = (left   - originX) >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    right  = (right  - originX) >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+
+    const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
+    // compute steps between raster tile samples / raster tiles / macro tile rows
+    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
+    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
+    const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
+    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
+
+    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
+
+    // first raster tile to clear
+    const uint32_t firstTileOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
+    uint8_t* pRowStart = pHotTile->pBuffer + firstTileOffset;
+
+    // loop over all raster tiles in the current hot tile
+    for (int tileRow = top; tileRow <= bottom; ++tileRow, pRowStart += macroTileRowStep)
+    {
+        uint8_t* pTile = pRowStart;
+        for (int tileCol = left; tileCol <= right; ++tileCol)
+        {
+            // each raster tile stores its samples back to back
+            for (int s = 0; s < numSamples; ++s, pTile += rasterTileSampleStep)
+            {
+                ClearRasterTile<format>(pTile, vClear);
+            }
+        }
+    }
+
+    pHotTile->state = HOTTILE_DIRTY;
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend clear work item for one macro tile.
+///        With KNOB_FAST_CLEAR the clear values are only recorded in the hot
+///        tile and the tile is put into the HOTTILE_CLEAR state (the actual
+///        clear is done later, e.g. by ProcessStoreTileBE).  Otherwise the
+///        hot tiles are cleared immediately through sClearTilesTable.
+/// @param pDC - pointer to draw context.
+/// @param workerId - worker thread id (unused here).
+/// @param macroTile - macro tile id.
+/// @param pUserData - pointer to the CLEAR_DESC describing the clear.
+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+
+    if (KNOB_FAST_CLEAR)
+    {
+        SWR_CONTEXT *pContext = pDC->pContext;
+        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
+        uint32_t numSamples = GetNumSamples(sampleCount);
+
+        SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
+
+        RDTSC_START(BEClear);
+
+        if (pClear->flags.mask & SWR_CLEAR_COLOR)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
+            // All we want to do here is to mark the hot tile as being in a "needs clear" state.
+            pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+            pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+            pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+            pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        if (pClear->flags.mask & SWR_CLEAR_DEPTH)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
+            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        if (pClear->flags.mask & SWR_CLEAR_STENCIL)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
+
+            // Widen the stencil value to 32 bits before storing it, exactly as the
+            // legacy path below does.  Reading a DWORD through a pointer to
+            // clearStencil would read past the field if it is narrower than 32 bits.
+            // NOTE(review): assumes clearStencil is an integer no wider than 32 bits,
+            // which the legacy path already relies on.
+            const uint32_t value = pClear->clearStencil;
+            pHotTile->clearData[0] = value;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        RDTSC_STOP(BEClear, 0, 0);
+    }
+    else
+    {
+        // Legacy clear
+        RDTSC_START(BEClear);
+
+        if (pClear->flags.mask & SWR_CLEAR_COLOR)
+        {
+            /// @todo clear data should come in as RGBA32_FLOAT
+            // The first four bytes of clearRTColor are interpreted as 8-bit
+            // channels and rescaled to [0, 1] floats.
+            DWORD clearData[4];
+            float clearFloat[4];
+            clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
+            clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
+            clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
+            clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+            clearData[0] = *(DWORD*)&clearFloat[0];
+            clearData[1] = *(DWORD*)&clearFloat[1];
+            clearData[2] = *(DWORD*)&clearFloat[2];
+            clearData[3] = *(DWORD*)&clearFloat[3];
+
+            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
+        }
+
+        if (pClear->flags.mask & SWR_CLEAR_DEPTH)
+        {
+            // Only element 0 carries data; zero the rest so the callee is never
+            // handed indeterminate values.
+            DWORD clearData[4] = { 0 };
+            clearData[0] = *(DWORD*)&pClear->clearDepth;
+            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
+        }
+
+        if (pClear->flags.mask & SWR_CLEAR_STENCIL)
+        {
+            const uint32_t value = pClear->clearStencil;
+            DWORD clearData[4] = { 0 };
+            clearData[0] = value;
+            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
+            // same guard as the color and depth paths
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
+        }
+
+        RDTSC_STOP(BEClear, 0, 0);
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend store-tiles work item: resolves any pending clear on the
+///        macro tile's hot tile, stores it through the context's store-tile
+///        callback when needed, and moves it to the requested post-store state.
+/// @param pDC - pointer to draw context.
+/// @param workerId - worker thread id (unused here).
+/// @param macroTile - macro tile id.
+/// @param pData - pointer to the STORE_TILES_DESC for this store.
+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+{
+    RDTSC_START(BEStoreTiles);
+    SWR_CONTEXT *pContext = pDC->pContext;
+    STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
+
+#ifdef KNOB_ENABLE_RDTSC
+    uint32_t numTiles = 0;
+#endif
+
+    // Select the hot tile format for the attachment; color is also the
+    // fallback for unknown attachments.
+    SWR_FORMAT srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
+    switch (pDesc->attachment)
+    {
+    case SWR_ATTACHMENT_COLOR0:
+    case SWR_ATTACHMENT_COLOR1:
+    case SWR_ATTACHMENT_COLOR2:
+    case SWR_ATTACHMENT_COLOR3:
+    case SWR_ATTACHMENT_COLOR4:
+    case SWR_ATTACHMENT_COLOR5:
+    case SWR_ATTACHMENT_COLOR6:
+    case SWR_ATTACHMENT_COLOR7:
+        break;
+    case SWR_ATTACHMENT_DEPTH:
+        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
+        break;
+    case SWR_ATTACHMENT_STENCIL:
+        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
+        break;
+    default:
+        SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment);
+        break;
+    }
+
+    uint32_t tileX, tileY;
+    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
+
+    // Only need to store the hottile if it's been rendered to...
+    HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
+    if (pHotTile != nullptr)
+    {
+        // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
+        if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
+        }
+
+        const bool storeRequested = (pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY);
+        if (pHotTile->state == HOTTILE_DIRTY || storeRequested)
+        {
+            // destination of the macro tile's upper-left corner
+            int dstX = KNOB_MACROTILE_X_DIM * tileX;
+            int dstY = KNOB_MACROTILE_Y_DIM * tileY;
+
+            pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
+                pDesc->attachment, dstX, dstY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+        }
+
+        const bool wasStored = (pHotTile->state == HOTTILE_DIRTY) || (pHotTile->state == HOTTILE_RESOLVED);
+        if (wasStored)
+        {
+            pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
+        }
+    }
+    RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Backend invalidate work item: marks the macro tile's existing hot
+///        tiles invalid for every attachment selected in the descriptor's mask.
+/// @param pDC - pointer to draw context.
+/// @param workerId - worker thread id (unused here).
+/// @param macroTile - macro tile id.
+/// @param pData - pointer to the INVALIDATE_TILES_DESC.
+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+    INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+
+    for (uint32_t attachment = 0; attachment < SWR_NUM_ATTACHMENTS; ++attachment)
+    {
+        // skip attachments that were not requested
+        if ((pDesc->attachmentMask & (1 << attachment)) == 0)
+        {
+            continue;
+        }
+
+        // look up only; a hot tile is not created if none exists
+        HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)attachment, false);
+        if (pHotTile != nullptr)
+        {
+            pHotTile->state = HOTTILE_INVALID;
+        }
+    }
+}
+
+// Per-lane pixel offsets for an 8-wide SIMD.  Lanes 0-3 hold a 2x2 quad at
+// x = 0..1, y = 0..1 and lanes 4-7 hold the neighbouring quad at x = 2..3,
+// y = 0..1, i.e. the 8 lanes cover a 4x2 pixel block.
+// The "Center" variants are the "UL" (upper-left) variants plus 0.5.
+// MASK has one bit per SIMD lane; generateInputCoverage uses it to keep the
+// low 8 coverage bits.
+// Any other SIMD width is rejected at compile time.
+#if KNOB_SIMD_WIDTH == 8
+const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
+const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
+const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+#define MASK 0xff
+#else
+#error Unsupported vector width
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns true when early-Z may be used for a pixel shader: either
+///        it is forced on, or the shader neither writes output depth, uses
+///        source depth, nor uses UAVs.
+/// @param pPSState - pixel shader state to inspect.
+INLINE
+bool CanEarlyZ(const SWR_PS_STATE *pPSState)
+{
+    if (pPSState->forceEarlyZ)
+    {
+        return true;
+    }
+
+    const bool shaderBlocksEarlyZ = pPSState->writesODepth || pPSState->usesSourceDepth || pPSState->usesUAV;
+    return !shaderBlocksEarlyZ;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Builds a per-lane mask of pixels rejected by user clip distances.
+/// @param clipMask - bit mask of enabled clip distances; only its population
+///        count is used here.
+/// @param pUserClipBuffer - three floats (A, B, C) per enabled clip distance,
+///        stored back to back.
+/// @param vI - per-lane barycentric I.
+/// @param vJ - per-lane barycentric J.
+/// @return movemask of the lanes whose interpolated clip distance is < 0 or
+///         NaN for at least one clip distance.
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+{
+    const uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
+    simdscalar vCulled = _simd_setzero_ps();
+
+    float* pCoeffs = pUserClipBuffer;
+    for (uint32_t dist = 0; dist < numClipDistance; ++dist, pCoeffs += 3)
+    {
+        // pull triangle clip distance values from clip buffer
+        simdscalar vA = _simd_broadcast_ss(pCoeffs + 0);
+        simdscalar vB = _simd_broadcast_ss(pCoeffs + 1);
+        simdscalar vC = _simd_broadcast_ss(pCoeffs + 2);
+
+        // interpolate the clip distance at each pixel
+        simdscalar vDistance = vplaneps(vA, vB, vC, vI, vJ);
+
+        // clip if interpolated clip distance is < 0 || NAN
+        simdscalar vReject = _simd_cmp_ps(_simd_setzero_ps(), vDistance, _CMP_NLE_UQ);
+
+        vCulled = _simd_or_ps(vCulled, vReject);
+    }
+
+    return _simd_movemask_ps(vCulled);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts per-sample coverage (one 8-bit, one-bit-per-pixel mask per
+///        sample) into per-pixel coverage (one bit per sample) for the 8
+///        pixels of a SIMD.
+/// @tparam sampleCountT - multisample count; selects how many samples are processed.
+/// @tparam bIsStandardPattern - when true, coverage is read per sample from
+///         coverageMask[]; otherwise the low 8 bits of coverageMask[0] are
+///         replicated to every sample.
+/// @tparam bForcedSampleCount - when false, each result is ANDed with sampleMask.
+/// @param coverageMask - per-sample coverage masks (64 bits per sample, of
+///        which the low 8 bits are consumed here).
+/// @param inputMask - output; inputMask[i] has bit s set when sample s of
+///        pixel i is covered.
+/// @param sampleMask - sample mask applied unless bForcedSampleCount.
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+
+ __m256i mask[2];
+ __m256i sampleCoverage[2];
+ if(bIsStandardPattern)
+ {
+ __m256i src = _mm256_set1_epi32(0);
+ __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+ // gather mask: enable one lane per sample that exists for this sample count
+ // (index1 / mask[1] are only initialized and only used for 16 samples)
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ mask[1] = _mm256_set1_epi32(-1);
+ index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ }
+
+ // gather coverage for samples 0-7
+ // (scale of 8 bytes steps over one uint64_t per sample; only the low 32 bits
+ // of each entry are loaded, masked-off lanes keep the zero from src)
+ sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // gather coverage for samples 8-15
+ sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+ }
+ }
+ else
+ {
+ // center coverage is the same for all samples; just broadcast to the sample slots
+ uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ }
+ }
+
+ // byte shuffle control: selects byte 0 of each 32-bit element (offsets 0x0,
+ // 0x4, 0x8, 0xC) into the low 4 bytes of each 128-bit lane; -1 zeroes the rest
+ mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+ // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+ __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+ __m256i packedCoverage1;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+ packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+ }
+
+ // Two implementations of the same packing step follow: the AVX build uses
+ // float-domain permute/shuffle/blend, the other build uses the 32-bit
+ // integer permute and blend.
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+ shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+ packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+ packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#else
+ __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+ // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+ packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#endif
+
+ // At this point byte s of packedSampleCoverage holds sample s's 8-bit pixel
+ // coverage.  movemask collects the top bit of every byte, i.e. the coverage
+ // of one pixel across all samples; start with pixel 7 (top bit) and shift the
+ // next lower pixel's bit into the top position each iteration.
+ for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+ {
+ // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+ inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+ if(!bForcedSampleCount)
+ {
+ // input coverage has to be anded with sample mask if MSAA isn't forced on
+ inputMask[i] &= sampleMask;
+ }
+
+ // shift to the next pixel in the 4x2
+ packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SIMD-register flavour of generateInputCoverage: computes the
+///        per-pixel input coverage masks and returns them as one 32-bit mask
+///        per lane (lane i holds the mask of pixel i).
+/// @param coverageMask - per-sample coverage masks.
+/// @param inputCoverage - output; per-pixel coverage bits reinterpreted as floats.
+/// @param sampleMask - sample mask, forwarded unchanged.
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+    uint32_t perPixelMask[KNOB_SIMD_WIDTH];
+    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, perPixelMask, sampleMask);
+
+    // element i of the array lands in lane i (the 8 entries fill the register exactly)
+    const __m256i vPerPixelMask = _mm256_loadu_si256((const __m256i*)perPixelMask);
+    inputCoverage = _simd_castsi_ps(vPerPixelMask);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Evaluates the barycentrics I, J and the interpolated 1/w at the
+///        pixel centers.  Does nothing when perspMask is false.
+/// @param coeffs - barycentric plane coefficients of the triangle.
+/// @param psContext - reads vX/vY .center; writes vI/vJ/vOneOverW .center.
+template<bool perspMask>
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    if(!perspMask)
+    {
+        return;
+    }
+
+    const simdscalar vPosX = psContext.vX.center;
+    const simdscalar vPosY = psContext.vY.center;
+
+    // evaluate I,J: plane equation first, then scale by 1/det
+    const simdscalar vIPlane = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, vPosX, vPosY);
+    const simdscalar vJPlane = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, vPosX, vPosY);
+    psContext.vI.center = _simd_mul_ps(vIPlane, coeffs.vRecipDet);
+    psContext.vJ.center = _simd_mul_ps(vJPlane, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Evaluates the barycentrics I, J and the interpolated 1/w at the
+///        sample positions.  Does nothing when perspMask is false.
+/// @param coeffs - barycentric plane coefficients of the triangle.
+/// @param psContext - reads vX/vY .sample; writes vI/vJ/vOneOverW .sample.
+template<bool perspMask>
+INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    if(!perspMask)
+    {
+        return;
+    }
+
+    const simdscalar vPosX = psContext.vX.sample;
+    const simdscalar vPosY = psContext.vY.sample;
+
+    // evaluate I,J: plane equation first, then scale by 1/det
+    const simdscalar vIPlane = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, vPosX, vPosY);
+    const simdscalar vJPlane = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, vPosX, vPosY);
+    psContext.vI.sample = _simd_mul_ps(vIPlane, coeffs.vRecipDet);
+    psContext.vJ.sample = _simd_mul_ps(vJPlane, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
+//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
+//     SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// @brief Computes the centroid evaluation position for each pixel of the SIMD.
+/// @param psContext - reads vX/vY .center; writes vX/vY .centroid.
+/// @param coverageMask - per-sample coverage masks.
+/// @param sampleMask - SampleMask rasterizer state.
+/// @param vXSamplePosUL - x of each pixel's upper-left corner.
+/// @param vYSamplePosUL - y of each pixel's upper-left corner.
+template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    // the per-lane arrays below are loaded as whole 256-bit registers
+    static_assert(KNOB_SIMD_WIDTH == 8, "CalcCentroidPos assumes an 8-wide SIMD");
+
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+
+    generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+
+    // Case (2) - partially covered pixel
+
+    // scan for first covered sample per pixel in the 4x2 span and look up its
+    // offset from the UL pixel corner (sample 0 is used when nothing is covered;
+    // those lanes are overridden by case (3) below)
+    float xSampleOffset[KNOB_SIMD_WIDTH];
+    float ySampleOffset[KNOB_SIMD_WIDTH];
+    for(uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
+    {
+        unsigned long sampleNum = 0;
+        if(inputMask[lane] > 0)
+        {
+            _BitScanForward(&sampleNum, inputMask[lane]);
+        }
+        xSampleOffset[lane] = MultisampleTraits<sampleCount>::X(sampleNum);
+        ySampleOffset[lane] = MultisampleTraits<sampleCount>::Y(sampleNum);
+    }
+
+    // add sample offset to UL pixel corner
+    __m256 vXSample = _simd_add_ps(vXSamplePosUL, _mm256_loadu_ps(xSampleOffset));
+    __m256 vYSample = _simd_add_ps(vYSamplePosUL, _mm256_loadu_ps(ySampleOffset));
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
+    __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const __m256i vZero = _simd_setzero_si();
+    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyways
+    unsigned long firstCoveredSampleMaskSample = 0;
+    if(sampleMask > 0)
+    {
+        _BitScanForward(&firstCoveredSampleMaskSample, sampleMask);
+    }
+
+    __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+
+    // The sample offset is relative to the pixel's UL corner, so it has to be
+    // added to the corner exactly as in case (2); blending in the bare offset
+    // would give a position that is not in the same space as vX/vY.center.
+    vXSample = _simd_add_ps(vXSamplePosUL, _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample)));
+    vYSample = _simd_add_ps(vYSamplePosUL, _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample)));
+
+    // blend in case 3a pixel locations
+    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the centroid position and, from it, the centroid
+///        barycentrics I, J and 1/w.  Does nothing when persp is 0.
+/// @tparam sampleCount - multisample count (as a SWR_MULTISAMPLE_COUNT value).
+/// @tparam persp - non-zero to evaluate; zero makes this a no-op.
+/// @tparam standardPattern - non-zero: centroid found from sample coverage;
+///         zero: the pixel center (UL corner + 0.5) is used.
+/// @tparam forcedMultisampleCount - forwarded to CalcCentroidPos.
+/// @param coeffs - barycentric plane coefficients of the triangle.
+/// @param psContext - writes vX/vY/vI/vJ/vOneOverW .centroid.
+/// @param coverageMask - per-sample coverage masks.
+/// @param sampleMask - SampleMask rasterizer state.
+/// @param vXSamplePosUL - x of each pixel's upper-left corner.
+/// @param vYSamplePosUL - y of each pixel's upper-left corner.
+template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+                                     const uint64_t *const coverageMask, const uint32_t sampleMask,
+                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    if(!persp)
+    {
+        return;
+    }
+
+    // calculate centroid positions
+    if(standardPattern)
+    {
+        ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+        CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, (forcedMultisampleCount != 0)>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
+    }
+    else
+    {
+        const __m256 vHalfPixel = _simd_set1_ps(0.5f);
+        psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, vHalfPixel);
+        psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, vHalfPixel);
+    }
+
+    // evaluate I,J: plane equation first, then scale by 1/det
+    const simdscalar vIPlane = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    const simdscalar vJPlane = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vI.centroid = _simd_mul_ps(vIPlane, coeffs.vRecipDet);
+    psContext.vJ.centroid = _simd_mul_ps(vJPlane, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Blends the shaded colors of each render target and writes them to
+///        the color hot tile under the coverage, depth and color write masks.
+/// @tparam NumRT - number of render targets to process.
+/// @tparam sampleCountT - multisample count (as a SWR_MULTISAMPLE_COUNT value).
+/// @param psContext - shaded outputs (updated in place by the blend function) and oMask.
+/// @param pColorBase - per render target base pointer into the color hot tile.
+/// @param sample - sample index being written.
+/// @param pBlendState - blend state; also supplies the per-target write disables.
+/// @param pfnBlendFunc - per render target blend function; nullptr skips blending.
+/// @param coverageMask - coverage mask; may be updated by the blend function.
+/// @param depthPassMask - lanes that passed the depth test.
+template<uint32_t NumRT, uint32_t sampleCountT>
+void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+                  const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+
+    ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+    static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+    // bytes between the component planes of one SIMD tile
+    const uint32_t componentStride = KNOB_SIMD_WIDTH * sizeof(float);
+
+    // single sampled targets always write at the base address
+    const uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
+    const uint32_t sampleOffset = (sampleCount == SWR_MULTISAMPLE_1X) ? 0 : rasterTileColorOffset;
+
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        uint8_t *pColorSample = pColorBase[rt] + sampleOffset;
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+        // Blend outputs and update coverage mask for alpha test
+        if(pfnBlendFunc[rt] != nullptr)
+        {
+            pfnBlendFunc[rt](
+                pBlendState,
+                psContext.shaded[rt],
+                psContext.shaded[1],
+                sample,
+                pColorSample,
+                psContext.shaded[rt],
+                &psContext.oMask,
+                (simdscalari*)&coverageMask);
+        }
+
+        // final write mask
+        const simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        // destination of each component plane
+        float *pRed   = (float*)pColorSample;
+        float *pGreen = (float*)(pColorSample + componentStride);
+        float *pBlue  = (float*)(pColorSample + componentStride * 2);
+        float *pAlpha = (float*)(pColorSample + componentStride * 3);
+
+        // store with color mask
+        if(!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps(pRed, outputMask, psContext.shaded[rt].x);
+        }
+        if(!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps(pGreen, outputMask, psContext.shaded[rt].y);
+        }
+        if(!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps(pBlue, outputMask, psContext.shaded[rt].z);
+        }
+        if(!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps(pAlpha, outputMask, psContext.shaded[rt].w);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Single-sample backend for one raster tile. Walks the tile one SIMD
+///        tile at a time; for each covered SIMD tile it computes pixel
+///        barycentrics and Z, applies user clip distances, runs the depth /
+///        stencil test (before or after the pixel shader depending on
+///        CanEarlyZ), runs the pixel shader, then the output merger and the
+///        final depth/stencil write.
+///
+///        work.coverageMask[0] is consumed in place: it is shifted right by
+///        one SIMD tile's worth of bits after every SIMD tile, so its low
+///        bits always describe the SIMD tile currently being processed. This
+///        keeps generateInputCoverage, which reads the mask through a pointer
+///        into work, in step with the coverage tested here, and matches how
+///        BackendSampleRate and BackendPixelRate consume their masks.
+/// @param pDC - draw context supplying API state and backend function table.
+/// @param workerId - not referenced directly in this body (may be used by the
+///        RDTSC / UPDATE_STAT macros).
+/// @param x - x origin of the raster tile; the loop covers KNOB_TILE_X_DIM from here.
+/// @param y - y origin of the raster tile; the loop covers KNOB_TILE_Y_DIM from here.
+/// @param work - triangle descriptor; coverageMask[0] is modified (see above).
+/// @param renderBuffers - color / depth / stencil destination pointers for the tile.
+template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    RDTSC_START(BESetup);
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    static const bool bInputCoverage = (bool)inputCoverage;
+    static const bool bCentroidPos = (bool)centroidPos;
+
+    SWR_CONTEXT *pContext = pDC->pContext;
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState = state.rastState;
+    const SWR_PS_STATE *pPSState = &state.psState;
+    const SWR_BLEND_STATE *pBlendState = &state.blendState;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // broadcast scalars
+    BarycentricCoeffs coeffs;
+    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+
+    // local copies of the hot-tile pointers; advanced per SIMD tile at Endtile
+    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
+    uint32_t NumRT = state.psState.numRenderTargets;
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        pColorBase[rt] = renderBuffers.pColor[rt];
+    }
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    RDTSC_STOP(BESetup, 0, 0);
+
+    SWR_PS_CONTEXT psContext;
+    psContext.pAttribs = work.pAttribs;
+    psContext.pPerspAttribs = work.pPerspAttribs;
+    psContext.frontFace = work.triFlags.frontFacing;
+    psContext.primID = work.triFlags.primID;
+
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext.I = work.I;
+    psContext.J = work.J;
+    psContext.recipDet = work.recipDet;
+    psContext.pRecipW = work.pRecipW;
+    psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
+    psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        // UL pixel corner
+        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        // pixel center
+        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            if(bInputCoverage)
+            {
+                // reads work.coverageMask[0] directly; correct for this SIMD tile because
+                // the mask is shifted in place at Endtile rather than through a local copy
+                generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(work.coverageMask[0] & MASK)
+            {
+                RDTSC_START(BEBarycentric);
+                psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+                // pixel center
+                psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+
+                if(bCentroidPos)
+                {
+                    // for 1x case, centroid is pixel center
+                    psContext.vX.centroid = psContext.vX.center;
+                    psContext.vY.centroid = psContext.vY.center;
+                    psContext.vI.centroid = psContext.vI.center;
+                    psContext.vJ.centroid = psContext.vJ.center;
+                    psContext.vOneOverW.centroid = psContext.vOneOverW.center;
+                }
+
+                // interpolate z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+
+                simdmask clipCoverageMask = work.coverageMask[0] & MASK;
+
+                // interpolate user clip distance if available
+                if(rastState.clipDistanceMask)
+                {
+                    clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                             psContext.vI.center, psContext.vJ.center);
+                }
+
+                // lane masks are built after clipping so clipped pixels are excluded from every later stage
+                simdscalar vCoverageMask = vMask(clipCoverageMask);
+                simdscalar depthPassMask = vCoverageMask;
+                simdscalar stencilPassMask = vCoverageMask;
+
+                // Early-Z?
+                if(CanEarlyZ(pPSState))
+                {
+                    RDTSC_START(BEEarlyDepthTest);
+                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                     psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+                    RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                    // early-exit if no pixels passed depth or earlyZ is forced on
+                    if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                    {
+                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                          pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            goto Endtile;
+                        }
+                    }
+                }
+
+                psContext.sampleIndex = 0;
+                psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                // execute pixel shader
+                RDTSC_START(BEPixelShader);
+                UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                RDTSC_STOP(BEPixelShader, 0, 0);
+
+                // pick up any lanes the shader deactivated
+                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+                // late-Z
+                if(!CanEarlyZ(pPSState))
+                {
+                    RDTSC_START(BELateDepthTest);
+                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                     psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+                    RDTSC_STOP(BELateDepthTest, 0, 0);
+
+                    if(!_simd_movemask_ps(depthPassMask))
+                    {
+                        // need to call depth/stencil write for stencil write
+                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                          pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+                        goto Endtile;
+                    }
+                }
+
+                uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                uint32_t statCount = _mm_popcnt_u32(statMask);
+                UPDATE_STAT(DepthPassCount, statCount);
+
+                // output merger
+                RDTSC_START(BEOutputMerger);
+                backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
+                                             vCoverageMask, depthPassMask);
+
+                // do final depth write after all pixel kills
+                if (!pPSState->forceEarlyZ)
+                {
+                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                      pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+                }
+                RDTSC_STOP(BEOutputMerger, 0, 0);
+            }
+
+Endtile:
+            RDTSC_START(BEEndTile);
+            // advance to the next SIMD tile: drop the coverage bits just consumed and step the hot-tile pointers
+            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            for(uint32_t rt = 0; rt < NumRT; ++rt)
+            {
+                pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+            RDTSC_STOP(BEEndTile, 0, 0);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sample-rate backend for one raster tile. For every SIMD tile and
+///        every covered sample it computes per-sample barycentrics and Z,
+///        applies user clip distances, runs the depth/stencil test (before or
+///        after the pixel shader depending on CanEarlyZ), runs the pixel
+///        shader for that sample, then the output merger and the final
+///        depth/stencil write.
+///
+///        work.coverageMask[sample] is shifted in place after each SIMD tile,
+///        on every path through the sample loop.
+/// @param pDC - draw context supplying API state and backend function table.
+/// @param workerId - not referenced directly in this body (may be used by the
+///        RDTSC / UPDATE_STAT macros).
+/// @param x - x origin of the raster tile; the loop covers KNOB_TILE_X_DIM from here.
+/// @param y - y origin of the raster tile; the loop covers KNOB_TILE_Y_DIM from here.
+/// @param work - triangle descriptor; coverageMask[] is modified (see above).
+/// @param renderBuffers - color / depth / stencil destination pointers for the tile.
+template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+    static const bool bInputCoverage = (bool)inputCoverage;
+    static const bool bCentroidPos = (bool)centroidPos;
+
+    RDTSC_START(BESetup);
+
+    SWR_CONTEXT *pContext = pDC->pContext;
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState = state.rastState;
+    const SWR_PS_STATE *pPSState = &state.psState;
+    const SWR_BLEND_STATE *pBlendState = &state.blendState;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // broadcast scalars
+    BarycentricCoeffs coeffs;
+    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+
+    // local copies of the hot-tile pointers; advanced once per SIMD tile
+    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
+    uint32_t NumRT = state.psState.numRenderTargets;
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        pColorBase[rt] = renderBuffers.pColor[rt];
+    }
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    RDTSC_STOP(BESetup, 0, 0);
+
+    SWR_PS_CONTEXT psContext;
+    psContext.pAttribs = work.pAttribs;
+    psContext.pPerspAttribs = work.pPerspAttribs;
+    psContext.pRecipW = work.pRecipW;
+    psContext.frontFace = work.triFlags.frontFacing;
+    psContext.primID = work.triFlags.primID;
+
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext.I = work.I;
+    psContext.J = work.J;
+    psContext.recipDet = work.recipDet;
+    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
+    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+    const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        // UL pixel corner
+        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        // pixel center
+        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            // pixel center
+            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+            RDTSC_START(BEBarycentric);
+            backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
+            if(bInputCoverage)
+            {
+                generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(bCentroidPos)
+            {
+                ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+            }
+
+            for(uint32_t sample = 0; sample < numSamples; sample++)
+            {
+                if (work.coverageMask[sample] & MASK)
+                {
+                    RDTSC_START(BEBarycentric);
+
+                    // calculate per sample positions
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+
+                    simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate z
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                             psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    // Build the lane mask only after user clipping has been applied, so
+                    // clipped lanes are excluded from depth/stencil, the pixel shader's
+                    // active mask and the output merger (same order as BackendSingleSample).
+                    simdscalar vCoverageMask = vMask(coverageMask);
+                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                    // Early-Z?
+                    if (CanEarlyZ(pPSState))
+                    {
+                        RDTSC_START(BEEarlyDepthTest);
+                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                        // early-exit if no samples passed depth or earlyZ is forced on.
+                        if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                        {
+                            DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                              pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            if (!_simd_movemask_ps(depthPassMask))
+                            {
+                                // 'continue' bypasses the shift at the bottom of the loop body, so do it here
+                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+                                continue;
+                            }
+                        }
+                    }
+
+                    psContext.sampleIndex = sample;
+                    psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                    // execute pixel shader
+                    RDTSC_START(BEPixelShader);
+                    UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                    RDTSC_STOP(BEPixelShader, 0, 0);
+
+                    // pick up any lanes the shader deactivated
+                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+                    // late-Z
+                    if (!CanEarlyZ(pPSState))
+                    {
+                        RDTSC_START(BELateDepthTest);
+                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        RDTSC_STOP(BELateDepthTest, 0, 0);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            // need to call depth/stencil write for stencil write
+                            DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                              pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            // 'continue' bypasses the shift at the bottom of the loop body, so do it here
+                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+                            continue;
+                        }
+                    }
+
+                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statCount = _mm_popcnt_u32(statMask);
+                    UPDATE_STAT(DepthPassCount, statCount);
+
+                    // output merger
+                    RDTSC_START(BEOutputMerger);
+                    backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
+                                                 vCoverageMask, depthPassMask);
+
+                    // do final depth write after all pixel kills
+                    if (!pPSState->forceEarlyZ)
+                    {
+                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                          pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                    }
+                    RDTSC_STOP(BEOutputMerger, 0, 0);
+                }
+                // drop the coverage bits of the SIMD tile just processed for this sample
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+            RDTSC_START(BEEndTile);
+            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            for (uint32_t rt = 0; rt < NumRT; ++rt)
+            {
+                pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+            RDTSC_STOP(BEEndTile, 0, 0);
+        }
+    }
+}
+
+template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> /*
+ * Pixel-rate backend for one raster tile.
+ *
+ * For each SIMD tile the pixel shader is invoked at most once, with
+ * barycentrics and Z evaluated at the pixel center. Coverage samples are
+ * depth/stencil tested individually, and the single set of shader results is
+ * then handed to the output merger and depth/stencil write for each output
+ * sample.
+ *
+ * The shader runs BEFORE the per-sample depth test when the PS writes oDepth
+ * or can kill pixels; otherwise it runs AFTER the test, and only if at least
+ * one sample passed.
+ *
+ * Side effects visible in this body: work.coverageMask[0..numCoverageSamples)
+ * is shifted in place after every SIMD tile; the hot-tile pointers taken from
+ * renderBuffers are advanced on local copies only.
+ *
+ * pDC           - draw context supplying API state and backend function table.
+ * workerId      - not referenced directly in this body (may be used by the
+ *                 RDTSC / UPDATE_STAT macros).
+ * x, y          - origin of the raster tile; the loops cover KNOB_TILE_X_DIM
+ *                 by KNOB_TILE_Y_DIM from there in SIMD-tile steps.
+ * work          - triangle descriptor (plane coefficients, coverage masks,
+ *                 attribute pointers, tri flags).
+ * renderBuffers - color / depth / stencil destination pointers for the tile.
+ */
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+ // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+ static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+ static const bool bIsStandardPattern = (bool)samplePattern; // non-zero samplePattern selects the per-sample-position paths below
+ static const bool bInputCoverage = (bool)inputCoverage;
+ static const bool bCentroidPos = (bool)centroidPos;
+ static const bool bForcedSampleCount = (bool)forcedSampleCount;
+
+ RDTSC_START(BESetup);
+
+ SWR_CONTEXT *pContext = pDC->pContext;
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_RASTSTATE& rastState = state.rastState;
+ const SWR_PS_STATE *pPSState = &state.psState;
+ const SWR_BLEND_STATE *pBlendState = &state.blendState;
+ const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+ // broadcast scalars
+ BarycentricCoeffs coeffs;
+ coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+ coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+ coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+ coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+ coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+ coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+ coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+ coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+ coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+ coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+ coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+ coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+ coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+
+ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; // local copies of the color pointers; advanced per SIMD tile at Endtile
+ uint32_t NumRT = state.psState.numRenderTargets;
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ pColorBase[rt] = renderBuffers.pColor[rt];
+ }
+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+ RDTSC_STOP(BESetup, 0, 0);
+
+ SWR_PS_CONTEXT psContext;
+ psContext.pAttribs = work.pAttribs;
+ psContext.pPerspAttribs = work.pPerspAttribs;
+ psContext.frontFace = work.triFlags.frontFacing;
+ psContext.primID = work.triFlags.primID;
+ psContext.pRecipW = work.pRecipW;
+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+ psContext.I = work.I;
+ psContext.J = work.J;
+ psContext.recipDet = work.recipDet;
+ psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
+ psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+ psContext.sampleIndex = 0; // fixed at 0: the shader is not run per sample in this backend
+
+ uint32_t numCoverageSamples; // how many work.coverageMask[] entries are tested and consumed per SIMD tile
+ if(bIsStandardPattern)
+ {
+ numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
+ }
+ else
+ {
+ numCoverageSamples = 1;
+ }
+
+ uint32_t numOMSamples; // how many samples the output-merger loop below iterates over
+ // RT has to be single sample if we're in forcedMSAA mode
+ if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
+ {
+ numOMSamples = 1;
+ }
+ // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
+ else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
+ {
+ numOMSamples = GetNumSamples(pBlendState->sampleCount);
+ }
+ // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
+ else
+ {
+ numOMSamples = MultisampleTraits<sampleCount>::numSamples;
+ }
+
+ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+ {
+ psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+ {
+ simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]; // per-sample Z used by the depth test and the depth write
+ psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ // set pixel center positions
+ psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+ if (bInputCoverage)
+ {
+ generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+ }
+
+ if(bCentroidPos)
+ {
+ ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+ RDTSC_START(BEBarycentric);
+ backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+ RDTSC_STOP(BEBarycentric, 0, 0);
+ }
+
+ // if oDepth written to, or there is a potential to discard any samples, we need to
+ // run the PS early, then interp or broadcast Z and test
+ if(pPSState->writesODepth || pPSState->killsPixel)
+ {
+ RDTSC_START(BEBarycentric);
+ backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+
+ // interpolate z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ RDTSC_STOP(BEBarycentric, 0, 0);
+
+ // execute pixel shader
+ RDTSC_START(BEPixelShader);
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ RDTSC_STOP(BEPixelShader, 0, 0);
+ }
+ else
+ {
+ psContext.activeMask = _simd_set1_epi32(-1); // PS has not run yet: all lanes active, so only coverage limits the masks below
+ }
+
+ // need to declare enough space for all samples
+ simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
+ simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
+ simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
+ simdscalar anyDepthSamplePassed = _simd_setzero_ps(); // OR of the per-sample depth results (or of masked coverage when the sample count is forced)
+ simdscalar anyStencilSamplePassed = _simd_setzero_ps(); // OR of the per-sample stencil results; stays zero on the forced-sample-count path
+ for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+ {
+ vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
+
+ // pull the active mask back out of the PS context (reflects any discards) and AND it with coverage
+ vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
+
+ if (!_simd_movemask_ps(vCoverageMask[sample]))
+ {
+ vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); // nothing left for this sample: zero its masks and skip the depth test
+ continue;
+ }
+
+ if(bForcedSampleCount)
+ {
+ // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+ const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
+ continue; // forced sample count: no per-sample depth/stencil test; depthPassMask/stencilPassMask/vZ[sample] are left unwritten here
+ }
+
+ depthPassMask[sample] = vCoverageMask[sample];
+
+ // if oDepth isn't written to, we need to interpolate Z for each sample
+ // if clip distances are enabled, we need to interpolate for each sample
+ if(!pPSState->writesODepth || rastState.clipDistanceMask)
+ {
+ RDTSC_START(BEBarycentric);
+ if(bIsStandardPattern)
+ {
+ // calculate per sample positions
+ psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+ psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+ }
+ else
+ {
+ psContext.vX.sample = psContext.vX.center;
+ psContext.vY.sample = psContext.vY.center;
+ }
+
+ // calc I & J per sample
+ backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+ // interpolate z
+ if (!pPSState->writesODepth)
+ {
+ vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ }
+
+ ///@todo: perspective correct vs non-perspective correct clipping?
+ // interpolate clip distances
+ if (rastState.clipDistanceMask)
+ {
+ uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+ psContext.vI.sample, psContext.vJ.sample);
+ vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+ }
+ RDTSC_STOP(BEBarycentric, 0, 0);
+ }
+ // else 'broadcast' and test psContext.vZ written from the PS each sample
+ else
+ {
+ vZ[sample] = psContext.vZ;
+ }
+
+ // offset depth/stencil buffers current sample
+ uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+ // ZTest for this sample
+ RDTSC_START(BEEarlyDepthTest);
+ stencilPassMask[sample] = vCoverageMask[sample];
+ depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+ RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+ anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
+ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+ uint32_t statCount = _mm_popcnt_u32(statMask);
+ UPDATE_STAT(DepthPassCount, statCount);
+ }
+
+ // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
+ if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+ {
+ RDTSC_START(BEBarycentric);
+ backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+ // interpolate z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ RDTSC_STOP(BEBarycentric, 0, 0);
+
+ // execute pixel shader
+ RDTSC_START(BEPixelShader);
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ RDTSC_STOP(BEPixelShader, 0, 0);
+ }
+ ///@todo: make sure this works for kill pixel
+ else if(!_simd_movemask_ps(anyStencilSamplePassed))
+ {
+ goto Endtile; // no stencil sample passed either: skip the output-merger loop for this SIMD tile
+ }
+
+ // loop over all samples, broadcasting the results of the PS to all passing pixels
+ for(uint32_t sample = 0; sample < numOMSamples; sample++)
+ {
+ uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+ // output merger
+ RDTSC_START(BEOutputMerger); // NOTE(review): BEOutputMerger is started again further down, and the 'continue' paths below skip RDTSC_STOP - confirm the RDTSC macros tolerate this
+
+ // skip if none of the pixels for this sample passed
+ simdscalar coverageMaskSample;
+ simdscalar depthMaskSample;
+ simdscalar stencilMaskSample; // NOTE(review): not assigned on the bForcedSampleCount path, yet passed to DepthStencilWrite at the end of this loop body - confirm
+ simdscalar vInterpolatedZ;
+
+ // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
+ // depth test is disabled, so just set the z val to 0.
+ if(bForcedSampleCount)
+ {
+ coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
+ vInterpolatedZ = _simd_setzero_ps();
+ }
+ else if(bIsStandardPattern)
+ {
+ if(!_simd_movemask_ps(depthPassMask[sample]))
+ {
+ depthPassMask[sample] = _simd_setzero_ps(); // no lane passed depth; the write below is still issued (presumably for stencil updates - confirm)
+ DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
+ vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+ continue;
+ }
+ coverageMaskSample = vCoverageMask[sample];
+ depthMaskSample = depthPassMask[sample];
+ stencilMaskSample = stencilPassMask[sample];
+ vInterpolatedZ = vZ[sample];
+ }
+ else
+ {
+ // center pattern only needs to use a single depth test as all samples are at the same position
+ if(!_simd_movemask_ps(depthPassMask[0]))
+ {
+ depthPassMask[0] = _simd_setzero_ps(); // no lane passed depth; the write below is still issued (presumably for stencil updates - confirm)
+ DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
+ vCoverageMask[0], pStencilSample, stencilPassMask[0]);
+ continue;
+ }
+ coverageMaskSample = (vCoverageMask[0]);
+ depthMaskSample = depthPassMask[0];
+ stencilMaskSample = stencilPassMask[0];
+ vInterpolatedZ = vZ[0];
+ }
+
+ // output merger
+ RDTSC_START(BEOutputMerger);
+ backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
+ coverageMaskSample, depthMaskSample);
+
+ DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
+ coverageMaskSample, pStencilSample, stencilMaskSample);
+ RDTSC_STOP(BEOutputMerger, 0, 0);
+ }
+
+Endtile:
+ RDTSC_START(BEEndTile);
+ for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+ {
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // drop the coverage bits of the SIMD tile just processed
+ }
+
+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ RDTSC_STOP(BEEndTile, 0, 0);
+ }
+ }
+}
+//////////////////////////////////////////////////////////////////////////
+/// @brief Optimized backend flow with NULL PS. No pixel shader and no output
+///        merger are invoked: for every sample selected by the blend-state
+///        sample mask, each covered SIMD tile gets per-sample barycentrics
+///        and Z, followed by a depth/stencil test and a depth/stencil write.
+///
+///        work.coverageMask[sample] is shifted in place after each SIMD tile,
+///        for the samples visited.
+/// @param pDC - draw context supplying API state and backend function table.
+/// @param workerId - not referenced directly in this body (may be used by the
+///        RDTSC / UPDATE_STAT macros).
+/// @param x - x origin of the raster tile; the loop covers KNOB_TILE_X_DIM from here.
+/// @param y - y origin of the raster tile; the loop covers KNOB_TILE_Y_DIM from here.
+/// @param work - triangle descriptor; coverageMask[] is modified (see above).
+/// @param renderBuffers - supplies the depth and stencil destination pointers.
+template<uint32_t sampleCountT>
+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    RDTSC_START(BESetup);
+
+    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+    SWR_CONTEXT *pContext = pDC->pContext;
+    const API_STATE& state = GetApiState(pDC);
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // splat the scalar plane coefficients across all SIMD lanes
+    BarycentricCoeffs coeffs;
+    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    // running hot-tile cursors, stepped once per SIMD tile
+    BYTE *pDepthBase = renderBuffers.pDepth;
+    BYTE *pStencilBase = renderBuffers.pStencil;
+
+    // bytes occupied by one SIMD of pixels in the depth / stencil hot tiles
+    const auto depthTileStep = (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+    const auto stencilTileStep = (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+    RDTSC_STOP(BESetup, 0, 0);
+
+    SWR_PS_CONTEXT psContext;
+    for (uint32_t tileY = y; tileY < y + KNOB_TILE_Y_DIM; tileY += SIMD_TILE_Y_DIM)
+    {
+        // upper-left y of every pixel in this SIMD row
+        const simdscalar vRowUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)tileY));
+
+        for (uint32_t tileX = x; tileX < x + KNOB_TILE_X_DIM; tileX += SIMD_TILE_X_DIM)
+        {
+            // upper-left x of every pixel in this SIMD tile
+            const simdscalar vColUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)tileX));
+
+            // visit only the samples enabled in the sample mask, lowest bit first
+            uint32_t pendingSamples = state.blendState.sampleMask;
+            unsigned long sample = 0;
+            while (_BitScanForward(&sample, pendingSamples))
+            {
+                pendingSamples &= ~(1 << sample);
+
+                if (work.coverageMask[sample] & MASK)
+                {
+                    RDTSC_START(BEBarycentric);
+                    // position of this sample inside each pixel
+                    psContext.vX.sample = _simd_add_ps(vColUL, MultisampleTraits<sampleCount>::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(vRowUL, MultisampleTraits<sampleCount>::vY(sample));
+
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // Z at the sample position
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+
+                    simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // depth/stencil storage for the current sample
+                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                    // test and write back to back; there is no shader in between
+                    RDTSC_START(BEEarlyDepthTest);
+                    simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                                psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                      pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                    RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statCount = _mm_popcnt_u32(statMask);
+                    UPDATE_STAT(DepthPassCount, statCount);
+                }
+
+                // drop the coverage bits of the SIMD tile just processed for this sample
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+            pDepthBase += depthTileStep;
+            pStencilBase += stencilTileStep;
+        }
+    }
+}
+
+void InitClearTilesTable()
+{
+    // Zero the whole table first so formats without an entry below stay cleared.
+    memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
+    sClearTilesTable[R8_UINT]            = ClearMacroTile<R8_UINT>;
+    sClearTilesTable[R32_FLOAT]          = ClearMacroTile<R32_FLOAT>;
+    sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
+    sClearTilesTable[B8G8R8A8_UNORM]     = ClearMacroTile<B8G8R8A8_UNORM>;
+    sClearTilesTable[R8G8B8A8_UNORM]     = ClearMacroTile<R8G8B8A8_UNORM>;
+}
+
+PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; // [sampleCount] -> BackendNullPS<sampleCount>
+PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {}; // [0 = SWR_INPUT_COVERAGE_NONE, 1 = SWR_INPUT_COVERAGE_NORMAL][isCentroid]
+PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {}; // [sampleCount][samplePattern][inputCoverage][isCentroid][final 0/1 BEChooser flag]
+PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {}; // [sampleCount][inputCoverage][isCentroid]
+PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {}; // [rtNum][sampleCount]
+PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {}; // [n] -> CalcPixelBarycentrics<n>
+PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {}; // [n] -> CalcSampleBarycentrics<n>
+PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {}; // [sampleCount][baryMask][patternNum][forcedSampleEnable]
+
+// Turns the runtime (render target index, sample count) pair into the matching
+// OutputMerger<...> instantiation, appending one template argument per GetFunc call.
+template <uint32_t... ArgsT>
+struct OMChooser
+{
+    // Final argument: the sample count completes the template argument list.
+    static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
+    {
+        switch (tArg)
+        {
+        case SWR_MULTISAMPLE_1X:  return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>;
+        case SWR_MULTISAMPLE_2X:  return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>;
+        case SWR_MULTISAMPLE_4X:  return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>;
+        case SWR_MULTISAMPLE_8X:  return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>;
+        case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>;
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            break;
+        }
+        return nullptr; // only reached for an unrecognized sample count
+    }
+
+    // Leading argument (0..8): append it as a template argument and recurse on the rest.
+    template <typename... TArgsT>
+    static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
+        case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
+        case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...);
+        case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...);
+        case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...);
+        case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...);
+        case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...);
+        case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...);
+        case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...);
+        default:
+            SWR_ASSERT(0 && "Invalid RT index\n");
+            break;
+        }
+        return nullptr; // only reached for an unrecognized RT index
+    }
+};
+
+// Resolves runtime arguments into a CalcCentroidBarycentrics<...> instantiation;
+// each GetFunc overload appends one template argument and hands off the rest.
+template <uint32_t... ArgsT>
+struct BECentroidBarycentricChooser
+{
+    // Final argument: a 0/1 flag (any non-zero value selects 1) that completes
+    // the template argument list.
+    template <typename... TArgsT>
+    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
+    {
+        if (tArg == 0)
+        {
+            return CalcCentroidBarycentrics<ArgsT..., 0>;
+        }
+
+        return CalcCentroidBarycentrics<ArgsT..., 1>;
+    }
+
+    // Sample count argument: append the enum value and recurse on the rest.
+    template <typename... TArgsT>
+    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_MULTISAMPLE_1X:  return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_2X:  return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_4X:  return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_8X:  return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            break;
+        }
+        return nullptr; // only reached for an unrecognized sample count
+    }
+
+    // Intermediate 0/1 flag: any non-zero value selects 1; recurse on the rest.
+    template <typename... TArgsT>
+    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
+    {
+        if (tArg == 0)
+        {
+            return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
+        }
+
+        return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
+    }
+};
+
+// Resolves runtime arguments into one of the backend function templates; each
+// GetFunc overload appends one template argument and hands off the rest.
+template <uint32_t... ArgsT>
+struct BEChooser
+{
+    // Final argument: picks which backend template receives the collected arguments.
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    {
+        switch (tArg)
+        {
+        case SWR_BACKEND_SINGLE_SAMPLE:    return BackendSingleSample<ArgsT...>;
+        case SWR_BACKEND_MSAA_PIXEL_RATE:  return BackendPixelRate<ArgsT...>;
+        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>;
+        default:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            break;
+        }
+        return nullptr; // only reached for an unrecognized backend selector
+    }
+
+    // Sample count argument: append the enum value as a template argument and
+    // recurse on the remaining arguments.
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_MULTISAMPLE_1X:  return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_2X:  return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_4X:  return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_8X:  return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            break;
+        }
+        return nullptr; // only reached for an unrecognized sample count
+    }
+
+    // Intermediate 0/1 flag: any non-zero value selects 1; recurse on the rest.
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
+    {
+        if (tArg == 0)
+        {
+            return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
+        }
+
+        return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
+    }
+};
+
+template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
+void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
+{
+    // table[rtNum][sampleCount] receives the OutputMerger specialization OMChooser picks for that pair.
+    for (uint32_t rt = SWR_ATTACHMENT_COLOR0; rt < numRenderTargets; ++rt)
+    {
+        for (uint32_t samples = SWR_MULTISAMPLE_1X; samples < numSampleRates; ++samples)
+        {
+            table[rt][samples] = OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rt, (SWR_MULTISAMPLE_COUNT)samples);
+        }
+    }
+}
+
+template <SWR_MULTISAMPLE_COUNT numSampleRates>
+void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
+                                   PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
+                                   PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
+{
+    // The pixel and sample tables are indexed directly by the 0/1 template argument.
+    pixelTable[0]  = CalcPixelBarycentrics<0>;
+    pixelTable[1]  = CalcPixelBarycentrics<1>;
+    sampleTable[0] = CalcSampleBarycentrics<0>;
+    sampleTable[1] = CalcSampleBarycentrics<1>;
+
+    // centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]
+    for (uint32_t samples = SWR_MULTISAMPLE_1X; samples < numSampleRates; ++samples)
+    {
+        for (uint32_t bary = 0; bary < 2; ++bary)
+        {
+            for (uint32_t pattern = 0; pattern < 2; ++pattern)
+            {
+                for (uint32_t forced = 0; forced < 2; ++forced)
+                {
+                    centroidTable[samples][bary][pattern][forced] = BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)samples, bary, pattern, forced);
+                }
+            }
+        }
+    }
+}
+
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2]) // table[0 = coverage NONE, 1 = coverage NORMAL][isCentroid]
+{ // fill the table that was passed in rather than a hard-coded global, so the parameter is honoured
+    table[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    table[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    table[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    table[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+}
+
+template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
+void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
+{
+    // table[sampleCount][samplePattern][inputCoverage][isCentroid][flag]: every slot gets the
+    // BackendPixelRate specialization for that combination; 'flag' is the final 0/1 chooser argument.
+    for (uint32_t samples = SWR_MULTISAMPLE_1X; samples < numSampleRates; ++samples)
+    {
+        for (uint32_t pattern = SWR_MSAA_CENTER_PATTERN; pattern < numSamplePatterns; ++pattern)
+        {
+            for (uint32_t coverage = SWR_INPUT_COVERAGE_NONE; coverage < numCoverageModes; ++coverage)
+            {
+                for (uint32_t centroid = 0; centroid < 2; ++centroid)
+                {
+                    table[samples][pattern][coverage][centroid][0] = BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)samples, pattern, coverage, centroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
+                    table[samples][pattern][coverage][centroid][1] = BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)samples, pattern, coverage, centroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
+                }
+            }
+        }
+    }
+}
+
+template <uint32_t numSampleRates, uint32_t numCoverageModes>
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
+{
+    // table[sampleCount][inputCoverage][isCentroid]: BackendSampleRate specializations, always
+    // built with the standard sample pattern and a final chooser flag of 0.
+    for (uint32_t samples = SWR_MULTISAMPLE_1X; samples < numSampleRates; ++samples)
+    {
+        for (uint32_t coverage = SWR_INPUT_COVERAGE_NONE; coverage < numCoverageModes; ++coverage)
+        {
+            table[samples][coverage][0] = BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)samples, SWR_MSAA_STANDARD_PATTERN, coverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+            table[samples][coverage][1] = BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)samples, SWR_MSAA_STANDARD_PATTERN, coverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+        }
+    }
+}
+
+void InitBackendFuncTables() // populates the gBackend* and g*BarycentricTable lookup tables
+{
+    // The null-PS backend has one specialization per sample count and needs no chooser.
+    gBackendNullPs[SWR_MULTISAMPLE_1X]  = &BackendNullPS<SWR_MULTISAMPLE_1X>;
+    gBackendNullPs[SWR_MULTISAMPLE_2X]  = &BackendNullPS<SWR_MULTISAMPLE_2X>;
+    gBackendNullPs[SWR_MULTISAMPLE_4X]  = &BackendNullPS<SWR_MULTISAMPLE_4X>;
+    gBackendNullPs[SWR_MULTISAMPLE_8X]  = &BackendNullPS<SWR_MULTISAMPLE_8X>;
+    gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
+    InitBackendSampleFuncTable(gBackendSingleSample);
+    InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
+    InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
+    InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
+    InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
new file mode 100644
index 00000000000..53089e5047b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -0,0 +1,59 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.h
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+#pragma once
+
+#include "common/os.h"
+#include "core/context.h"
+
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); // NOTE(review): backend.cpp uses BackendNullPS<sampleCount> as a template; confirm this non-template declaration is still needed
+void InitClearTilesTable(); // fills sClearTilesTable with ClearMacroTile<format> entries
+
+enum SWR_BACKEND_FUNCS // selects which backend template BEChooser instantiates
+{
+ SWR_BACKEND_SINGLE_SAMPLE, // -> BackendSingleSample<...>
+ SWR_BACKEND_MSAA_PIXEL_RATE, // -> BackendPixelRate<...>
+ SWR_BACKEND_MSAA_SAMPLE_RATE, // -> BackendSampleRate<...>
+ SWR_BACKEND_FUNCS_MAX, // not a selectable backend; BEChooser asserts on it
+};
+void InitBackendFuncTables(); // populates the lookup tables declared below
+
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; // [sampleCount]
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; // [0 = SWR_INPUT_COVERAGE_NONE, 1 = SWR_INPUT_COVERAGE_NORMAL][isCentroid]
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; // [sampleCount][samplePattern][inputCoverage][isCentroid][final 0/1 BEChooser flag]
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; // [sampleCount][inputCoverage][isCentroid]
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; // [rtNum][sampleCount]
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; // [n] -> CalcPixelBarycentrics<n>
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; // [n] -> CalcSampleBarycentrics<n>
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; // [sampleCount][baryMask][patternNum][forcedSampleEnable]
diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h
new file mode 100644
index 00000000000..626c237d75b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h
@@ -0,0 +1,318 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file blend.h
+*
+* @brief Implementation for blending operations.
+*
+******************************************************************************/
+#include "state.h"
+
+// Expands a blend factor selector into per-channel multipliers.
+//
+//   func          - factor to build
+//   constantColor - blend constant; channels are read through operator[] 0..3
+//   src, src1     - first and second source colors
+//   dst           - destination color
+//   out           - x/y/z are written only when Color is true and w only when
+//                   Alpha is true; unselected channels keep their previous value
+// An unrecognized func asserts and leaves the factor unset.
+template<bool Color, bool Alpha>
+INLINE
+void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out)
+{
+    const simdscalar vOne = _simd_set1_ps(1.0f);
+    simdvector factor;
+
+    switch (func)
+    {
+    case BLENDFACTOR_ZERO:
+        factor.x = _simd_setzero_ps();
+        factor.y = _simd_setzero_ps();
+        factor.z = _simd_setzero_ps();
+        factor.w = _simd_setzero_ps();
+        break;
+
+    case BLENDFACTOR_ONE:
+        factor.x = vOne;
+        factor.y = vOne;
+        factor.z = vOne;
+        factor.w = vOne;
+        break;
+
+    case BLENDFACTOR_SRC_COLOR:
+        factor = src;
+        break;
+
+    case BLENDFACTOR_DST_COLOR:
+        factor = dst;
+        break;
+
+    case BLENDFACTOR_INV_SRC_COLOR:
+        factor.x = _simd_sub_ps(vOne, src.x);
+        factor.y = _simd_sub_ps(vOne, src.y);
+        factor.z = _simd_sub_ps(vOne, src.z);
+        factor.w = _simd_sub_ps(vOne, src.w);
+        break;
+
+    case BLENDFACTOR_INV_DST_COLOR:
+        factor.x = _simd_sub_ps(vOne, dst.x);
+        factor.y = _simd_sub_ps(vOne, dst.y);
+        factor.z = _simd_sub_ps(vOne, dst.z);
+        factor.w = _simd_sub_ps(vOne, dst.w);
+        break;
+
+    case BLENDFACTOR_SRC_ALPHA:
+        factor.x = src.w;
+        factor.y = src.w;
+        factor.z = src.w;
+        factor.w = src.w;
+        break;
+
+    case BLENDFACTOR_INV_SRC_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = _simd_sub_ps(vOne, src.w);
+        break;
+
+    case BLENDFACTOR_DST_ALPHA:
+        factor.x = dst.w;
+        factor.y = dst.w;
+        factor.z = dst.w;
+        factor.w = dst.w;
+        break;
+
+    case BLENDFACTOR_INV_DST_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = _simd_sub_ps(vOne, dst.w);
+        break;
+
+    case BLENDFACTOR_SRC_ALPHA_SATURATE:
+    {
+        // color channels use min(src alpha, 1 - dst alpha); the alpha factor is 1
+        simdscalar vSaturated = _simd_min_ps(src.w, _simd_sub_ps(vOne, dst.w));
+        factor.x = vSaturated;
+        factor.y = vSaturated;
+        factor.z = vSaturated;
+        factor.w = vOne;
+        break;
+    }
+
+    case BLENDFACTOR_CONST_COLOR:
+        factor.x = constantColor[0];
+        factor.y = constantColor[1];
+        factor.z = constantColor[2];
+        factor.w = constantColor[3];
+        break;
+
+    case BLENDFACTOR_CONST_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = constantColor[3];
+        break;
+
+    case BLENDFACTOR_INV_CONST_COLOR:
+        factor.x = _simd_sub_ps(vOne, constantColor[0]);
+        factor.y = _simd_sub_ps(vOne, constantColor[1]);
+        factor.z = _simd_sub_ps(vOne, constantColor[2]);
+        factor.w = _simd_sub_ps(vOne, constantColor[3]);
+        break;
+
+    case BLENDFACTOR_INV_CONST_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = _simd_sub_ps(vOne, constantColor[3]);
+        break;
+
+    case BLENDFACTOR_SRC1_COLOR:
+        factor.x = src1.x;
+        factor.y = src1.y;
+        factor.z = src1.z;
+        factor.w = src1.w;
+        break;
+
+    case BLENDFACTOR_SRC1_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = src1.w;
+        break;
+
+    case BLENDFACTOR_INV_SRC1_COLOR:
+        factor.x = _simd_sub_ps(vOne, src1.x);
+        factor.y = _simd_sub_ps(vOne, src1.y);
+        factor.z = _simd_sub_ps(vOne, src1.z);
+        factor.w = _simd_sub_ps(vOne, src1.w);
+        break;
+
+    case BLENDFACTOR_INV_SRC1_ALPHA:
+        factor.x = factor.y = factor.z = factor.w = _simd_sub_ps(vOne, src1.w);
+        break;
+
+    default:
+        SWR_ASSERT(false, "Unimplemented blend factor: %d", func);
+        break;
+    }
+
+    // hand back only the channels this instantiation is responsible for
+    if (Color)
+    {
+        out.x = factor.x;
+        out.y = factor.y;
+        out.z = factor.z;
+    }
+
+    if (Alpha)
+    {
+        out.w = factor.w;
+    }
+}
+
+template<bool Color, bool Alpha>
+INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out)
+{
+    // Combines src*srcFactor with dst*dstFactor per blendOp; 'out' gets x/y/z only if Color and w only if Alpha.
+    simdvector blended;
+    switch (blendOp)
+    {
+    case BLENDOP_ADD:           // src * srcFactor + dst * dstFactor
+        blended.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
+        blended.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
+        blended.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
+        blended.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
+        break;
+
+    case BLENDOP_SUBTRACT:      // src * srcFactor - dst * dstFactor
+        blended.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
+        blended.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
+        blended.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
+        blended.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
+        break;
+
+    case BLENDOP_REVSUBTRACT:   // dst * dstFactor - src * srcFactor
+        blended.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
+        blended.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
+        blended.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
+        blended.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
+        break;
+
+    case BLENDOP_MIN:           // min of the two weighted terms (factors are applied here)
+        blended.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
+        blended.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
+        blended.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
+        blended.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
+        break;
+
+    case BLENDOP_MAX:           // max of the two weighted terms (factors are applied here)
+        blended.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
+        blended.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
+        blended.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
+        blended.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
+        break;
+
+    default:
+        SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp);
+    }
+
+    if (Color)
+    {
+        out.x = blended.x;
+        out.y = blended.y;
+        out.z = blended.z;
+    }
+    if (Alpha)
+    {
+        out.w = blended.w;
+    }
+}
+
+// Clamps all four channels of src in place according to the component type:
+//   SWR_TYPE_FLOAT - left unchanged
+//   SWR_TYPE_UNORM - limited to [0, 1]
+//   SWR_TYPE_SNORM - limited to [-1, 1]
+// Any other type asserts and leaves src unchanged.
+template<SWR_TYPE type>
+INLINE void Clamp(simdvector &src)
+{
+    switch (type)
+    {
+    case SWR_TYPE_FLOAT:
+        // nothing to clamp
+        break;
+
+    case SWR_TYPE_UNORM:
+    {
+        const simdscalar vLo = _simd_setzero_ps();
+        const simdscalar vHi = _simd_set1_ps(1.0f);
+        src.x = _simd_min_ps(_simd_max_ps(src.x, vLo), vHi);
+        src.y = _simd_min_ps(_simd_max_ps(src.y, vLo), vHi);
+        src.z = _simd_min_ps(_simd_max_ps(src.z, vLo), vHi);
+        src.w = _simd_min_ps(_simd_max_ps(src.w, vLo), vHi);
+        break;
+    }
+
+    case SWR_TYPE_SNORM:
+    {
+        const simdscalar vLo = _simd_set1_ps(-1.0f);
+        const simdscalar vHi = _simd_set1_ps(1.0f);
+        src.x = _simd_min_ps(_simd_max_ps(src.x, vLo), vHi);
+        src.y = _simd_min_ps(_simd_max_ps(src.y, vLo), vHi);
+        src.z = _simd_min_ps(_simd_max_ps(src.z, vLo), vHi);
+        src.w = _simd_min_ps(_simd_max_ps(src.w, vLo), vHi);
+        break;
+    }
+
+    default:
+        SWR_ASSERT(false, "Unimplemented clamp: %d", type);
+        break;
+    }
+}
+
+// Blends src (and src1 for dual-source factors) against the color stored at pDst and
+// writes the combined color to 'result'. src and src1 are clamped in place on the way.
+template<SWR_TYPE type>
+void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result)
+{
+    // load the destination color (KNOB_COLOR_HOT_TILE_FORMAT layout) from pDst
+    simdvector dstColor;
+    LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dstColor);
+
+    simdvector blendConst;
+    blendConst.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
+    blendConst.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
+    blendConst.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
+    blendConst.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
+
+    // every blend input is clamped for 'type' before any factor is built
+    Clamp<type>(src);
+    Clamp<type>(src1);
+    Clamp<type>(dstColor);
+    Clamp<type>(blendConst);
+
+    simdvector srcWeight, dstWeight;
+    if (pBlendState->independentAlphaBlendEnable)
+    {
+        // color and alpha take their factors and blend op from separate state fields
+        GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, blendConst, src, src1, dstColor, srcWeight);
+        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, blendConst, src, src1, dstColor, srcWeight);
+        GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, blendConst, src, src1, dstColor, dstWeight);
+        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, blendConst, src, src1, dstColor, dstWeight);
+        BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcWeight, dstColor, dstWeight, result);
+        BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcWeight, dstColor, dstWeight, result);
+    }
+    else
+    {
+        GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, blendConst, src, src1, dstColor, srcWeight);
+        GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, blendConst, src, src1, dstColor, dstWeight);
+        BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcWeight, dstColor, dstWeight, result);
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
new file mode 100644
index 00000000000..ce27bf71d3c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -0,0 +1,201 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file clip.cpp
+*
+* @brief Implementation for clipping
+*
+******************************************************************************/
+
+#include <assert.h>
+
+#include "common/os.h"
+#include "core/clip.h"
+
+float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
+{
+    return boundaryCoord0 / (boundaryCoord0 - boundaryCoord1); // t where coord0 + t * (coord1 - coord0) reaches 0
+}
+
+template<SWR_CLIPCODES ClippingPlane>
+inline void intersect(
+    int s,                   // index of the edge's first vertex in pInPts
+    int p,                   // index of the edge's second vertex in pInPts
+    const float *pInPts,     // input positions, 4 floats per vertex
+    const float *pInAttribs, // input attributes, numInAttribs contiguous floats per vertex
+    int numInAttribs,        // attribute floats per vertex
+    int i,                   // output vertex slot to fill
+    float *pOutPts,          // output positions; the new point lands at i*4
+    float *pOutAttribs)      // output attributes; the new values land at i*numInAttribs
+{
+    float t;
+
+    // Parameter along the edge at which it meets the plane, e.g. for the x = w (RIGHT) plane:
+    // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w))
+    const float *p0 = &pInPts[s*4];
+    const float *p1 = &pInPts[p*4];
+
+    switch (ClippingPlane)
+    {
+    case FRUSTUM_LEFT:   t = ComputeInterpFactor(p0[3] + p0[0], p1[3] + p1[0]); break;
+    case FRUSTUM_RIGHT:  t = ComputeInterpFactor(p0[3] - p0[0], p1[3] - p1[0]); break;
+    case FRUSTUM_TOP:    t = ComputeInterpFactor(p0[3] + p0[1], p1[3] + p1[1]); break;
+    case FRUSTUM_BOTTOM: t = ComputeInterpFactor(p0[3] - p0[1], p1[3] - p1[1]); break;
+    case FRUSTUM_NEAR:   t = ComputeInterpFactor(p0[2], p1[2]); break;
+    case FRUSTUM_FAR:    t = ComputeInterpFactor(p0[3] - p0[2], p1[3] - p1[2]); break;
+    default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
+    }
+
+    // attribute rows of the two edge endpoints
+    const float *attr0 = &pInAttribs[s*numInAttribs];
+    const float *attr1 = &pInAttribs[p*numInAttribs];
+
+    float *outPos  = &pOutPts[i*4];
+    float *outAttr = &pOutAttribs[i*numInAttribs];
+
+    // Position of the intersection point.
+    for (int c = 0; c < 4; ++c)
+    {
+        outPos[c] = p0[c] + (p1[c]-p0[c])*t;
+    }
+
+    // Attributes use the same parameter.
+    for (int a = 0; a < numInAttribs; ++a)
+    {
+        outAttr[a] = attr0[a] + (attr1[a]-attr0[a])*t;
+    }
+}
+
+
+// Returns non-zero when vertex v is on the inner side of ClippingPlane.
+// In homogeneous coordinates: -w <= x,y <= w and 0 <= z <= w.
+template<SWR_CLIPCODES ClippingPlane>
+inline int inside(const float v[4])
+{
+    const float x = v[0], y = v[1], z = v[2], w = v[3];
+    switch (ClippingPlane)
+    {
+    case FRUSTUM_LEFT:   return (x >= -w);
+    case FRUSTUM_RIGHT:  return (x <= w);
+    case FRUSTUM_TOP:    return (y >= -w);
+    case FRUSTUM_BOTTOM: return (y <= w);
+    case FRUSTUM_NEAR:   return (z >= 0.0f);
+    case FRUSTUM_FAR:    return (z <= w);
+    default:
+        SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
+        return 0;
+    }
+}
+
+
+// Clips a polygon given in homogeneous coordinates against one clipping plane.
+// Reads numInPts vertices from pInPts/pInAttribs (4 position floats and
+// numInAttribs attribute floats per vertex) and writes the clipped polygon to
+// pOutPts/pOutAttribs using the same layout.
+// Returns the number of vertices written.
+template<SWR_CLIPCODES ClippingPlane>
+int ClipTriToPlane( const float *pInPts, int numInPts,
+                    const float *pInAttribs, int numInAttribs,
+                    float *pOutPts, float *pOutAttribs)
+{
+    int numOut = 0; // vertices emitted so far; vertex n starts at float n*4 of pOutPts
+
+    for (int edge = 0; edge < numInPts; ++edge)
+    {
+        const int curr = edge;
+        const int next = (edge + 1 == numInPts) ? 0 : edge + 1;
+
+        const int currInside = inside<ClippingPlane>(&pInPts[curr*4]);
+        const int nextInside = inside<ClippingPlane>(&pInPts[next*4]);
+
+        // an edge whose endpoints lie on different sides crosses the plane
+        if (currInside != nextInside)
+        {
+            // emit the crossing point
+            intersect<ClippingPlane>(curr, next, pInPts, pInAttribs, numInAttribs, numOut, pOutPts, pOutAttribs);
+            ++numOut;
+        }
+        if (nextInside)
+        {
+            // the edge's end vertex is kept: copy its position...
+            for (int c = 0; c < 4; ++c)
+            {
+                pOutPts[numOut*4 + c] = pInPts[next*4 + c];
+            }
+            // ...and its attributes
+            for (int a = 0; a < numInAttribs; ++a)
+            {
+                pOutAttribs[numOut*numInAttribs+a] = pInAttribs[next*numInAttribs+a];
+            }
+            ++numOut;
+        }
+        // otherwise the edge stays outside (or ends outside) and its end
+        // vertex is dropped
+    }
+    return numOut;
+}
+
+
+
+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
+{
+    // Each plane can add one vertex to a convex polygon, so a triangle can grow to 3 + 6 = 9 vertices
+    // over the six passes (8 of them land in the scratch buffers); size the scratch for that, not 6.
+    const int maxClipVerts = 9;
+    OSALIGN(float, 16) tempPts[maxClipVerts * 4];
+    OSALIGN(float, 16) tempAttribs[maxClipVerts * KNOB_NUM_ATTRIBUTES * 4];
+
+    // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
+    int numPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
+    numPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, numPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
+    numPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, numPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
+    numPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, numPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
+    numPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, numPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
+    numPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, numPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
+    // NOTE(review): the caller-owned output buffers face the same 9-vertex worst case; the limit below is the original contract - confirm caller sizes.
+    SWR_ASSERT(numPts <= 6);
+    *numVerts = numPts;
+}
+
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+{
+ RDTSC_START(FEClipTriangles); // opens the FEClipTriangles RDTSC bucket around the clip stage
+ Clipper<3> clipper(workerId, pDC); // template argument 3 is presumably vertices per primitive - confirm in clip.h
+ clipper.ExecuteStage(pa, prims, primMask, primId); // all of the actual work happens in Clipper::ExecuteStage
+ RDTSC_STOP(FEClipTriangles, 1, 0);
+}
+
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+{
+ RDTSC_START(FEClipLines); // opens the FEClipLines RDTSC bucket around the clip stage
+ Clipper<2> clipper(workerId, pDC); // template argument 2 is presumably vertices per primitive - confirm in clip.h
+ clipper.ExecuteStage(pa, prims, primMask, primId); // all of the actual work happens in Clipper::ExecuteStage
+ RDTSC_STOP(FEClipLines, 1, 0);
+}
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+{
+ RDTSC_START(FEClipPoints); // opens the FEClipPoints RDTSC bucket around the clip stage
+ Clipper<1> clipper(workerId, pDC); // template argument 1 is presumably vertices per primitive - confirm in clip.h
+ clipper.ExecuteStage(pa, prims, primMask, primId); // all of the actual work happens in Clipper::ExecuteStage
+ RDTSC_STOP(FEClipPoints, 1, 0);
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
new file mode 100644
index 00000000000..49494a4e374
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -0,0 +1,868 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file clip.h
+*
+* @brief Definitions for clipping
+*
+******************************************************************************/
+#pragma once
+
+#include "common/simdintrin.h"
+#include "core/context.h"
+#include "core/pa.h"
+#include "rdtsc_core.h"
+
+// Per-vertex clip code bits produced by ComputeClipCodes(). The codes live in the lanes of a
+// float SIMD register and are combined with bitwise and/or, then tested with float compares.
+// The trailing comments give the condition under which ComputeClipCodes() sets each code.
+enum SWR_CLIPCODES
+{
+ // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
+ // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
+#define CLIPCODE_SHIFT 23
+ FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), // x < -w
+ FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), // y < -w
+ FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), // x > w
+ FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), // y > w
+
+ // near/far are only set when depth clipping is enabled
+ FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), // z < 0 (DX) or z < -w (otherwise)
+ FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), // z > w
+
+ NEGW = (0x40 << CLIPCODE_SHIFT), // w <= 0
+
+ // guardband codes share one high bit and are told apart by their low 4 bits
+ GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), // x < -w * gbState.left
+ GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), // y < -w * gbState.top
+ GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), // x > w * gbState.right
+ GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) // y > w * gbState.bottom
+};
+
+// Union of the six view-frustum plane codes.
+#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
+// Codes that force a primitive through the clipper: near/far planes, the four guardband
+// edges and NEGW. Clipper::ComputeClipMask() tests the clip code union against this mask.
+#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
+
+// Scalar (single primitive) triangle clipper, called by Clipper::ClipScalar().
+// pTriangle/pAttribs hold the input triangle's positions and attributes, numAttribs is the
+// attribute count as passed by the caller (ClipScalar passes linkageCount * 4).
+// The clipped positions are written to pOutTriangles, their attributes to pOutAttribs and
+// the number of resulting vertices to *numVerts.
+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
+ int *numVerts, float *pOutAttribs);
+
+/// @brief Computes the clip code bits (see SWR_CLIPCODES) for every lane of one SIMD vertex.
+/// @param type - driver type; selects the near plane convention (DX: z >= 0, otherwise z >= -w)
+/// @param state - API state; supplies depthClipEnable and the guardband scale factors
+/// @param vertex - clip space position (x, y, z, w) of the vertex in each lane
+/// @param clipCodes - receives the or'ed clip code bits for each lane
+INLINE
+void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes)
+{
+ // reduces an all-ones/all-zeros compare result to the bits of a single clip code
+ auto codeBits = [](simdscalar vCompare, int code) -> simdscalar
+ {
+ return _simd_and_ps(vCompare, _simd_castsi_ps(_simd_set1_epi32(code)));
+ };
+
+ clipCodes = _simd_setzero_ps();
+
+ const simdscalar vMinusW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
+
+ // view frustum in x and y: inside when -w <= coord <= w
+ clipCodes = codeBits(_simd_cmplt_ps(vertex.x, vMinusW), FRUSTUM_LEFT);
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmplt_ps(vertex.y, vMinusW), FRUSTUM_TOP));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmpgt_ps(vertex.x, vertex.w), FRUSTUM_RIGHT));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmpgt_ps(vertex.y, vertex.w), FRUSTUM_BOTTOM));
+
+ // near/far planes are only tested when depth clipping is on
+ if (state.rastState.depthClipEnable)
+ {
+ // DX clips depth [0..w], GL clips [-w..w]
+ const simdscalar vBehindNear = (type == DX)
+ ? _simd_cmplt_ps(vertex.z, _simd_setzero_ps())
+ : _simd_cmplt_ps(vertex.z, vMinusW);
+ clipCodes = _simd_or_ps(clipCodes, codeBits(vBehindNear, FRUSTUM_NEAR));
+
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmpgt_ps(vertex.z, vertex.w), FRUSTUM_FAR));
+ }
+
+ // w <= 0
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmple_ps(vertex.w, _simd_setzero_ps()), NEGW));
+
+ // guardband: same tests as the x/y frustum planes with w scaled by the guardband factors
+ simdscalar vLimit = _simd_mul_ps(vMinusW, _simd_set1_ps(state.gbState.left));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmplt_ps(vertex.x, vLimit), GUARDBAND_LEFT));
+
+ vLimit = _simd_mul_ps(vMinusW, _simd_set1_ps(state.gbState.top));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmplt_ps(vertex.y, vLimit), GUARDBAND_TOP));
+
+ vLimit = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmpgt_ps(vertex.x, vLimit), GUARDBAND_RIGHT));
+
+ vLimit = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom));
+ clipCodes = _simd_or_ps(clipCodes, codeBits(_simd_cmpgt_ps(vertex.y, vLimit), GUARDBAND_BOTTOM));
+}
+
+template<uint32_t NumVertsPerPrim>
+class Clipper
+{
+public:
+ /// @brief Binds the clipper to a worker and draw context.
+ /// The driver type is read from the draw's SWR context and the API state from GetApiState().
+ Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC)
+ : workerId(in_workerId)
+ , driverType(in_pDC->pContext->driverType)
+ , pDC(in_pDC)
+ , state(GetApiState(in_pDC))
+ {
+ // only points (1), lines (2) and triangles (3) are supported
+ static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
+ }
+
+ /// @brief Fills clipCodes[] with the clip codes of each of the primitive's vertices.
+ /// @param vertex - NumVertsPerPrim SIMD vertex positions
+ void ComputeClipCodes(simdvector vertex[])
+ {
+ for (uint32_t vert = 0; vert != NumVertsPerPrim; ++vert)
+ {
+ // defer to the free function of the same name
+ ::ComputeClipCodes(driverType, state, vertex[vert], clipCodes[vert]);
+ }
+ }
+
+ /// @brief Returns, per lane, the clip code bits set on every vertex of the primitive.
+ simdscalar ComputeClipCodeIntersection()
+ {
+ simdscalar vCommon = clipCodes[0];
+ for (uint32_t vert = 1; vert < NumVertsPerPrim; ++vert)
+ {
+ vCommon = _simd_and_ps(vCommon, clipCodes[vert]);
+ }
+ return vCommon;
+ }
+
+ /// @brief Returns, per lane, the clip code bits set on at least one vertex of the primitive.
+ simdscalar ComputeClipCodeUnion()
+ {
+ simdscalar vAny = clipCodes[0];
+ for (uint32_t vert = 1; vert < NumVertsPerPrim; ++vert)
+ {
+ vAny = _simd_or_ps(vAny, clipCodes[vert]);
+ }
+ return vAny;
+ }
+
+ /// @brief Returns a lane bitmask of primitives with the NEGW code set on any vertex.
+ int ComputeNegWMask()
+ {
+ const simdscalar vNegWBit = _simd_castsi_ps(_simd_set1_epi32(NEGW));
+ const simdscalar vNegWCodes = _simd_and_ps(ComputeClipCodeUnion(), vNegWBit);
+ return _simd_movemask_ps(_simd_cmpneq_ps(vNegWCodes, _simd_setzero_ps()));
+ }
+
+ /// @brief Returns a lane bitmask of primitives that have any GUARDBAND_CLIP_MASK code
+ /// set on any vertex, i.e. the primitives that must go through the clipper.
+ int ComputeClipMask()
+ {
+ const simdscalar vClipBits = _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK));
+ const simdscalar vClipCodes = _simd_and_ps(ComputeClipCodeUnion(), vClipBits);
+ return _simd_movemask_ps(_simd_cmpneq_ps(vClipCodes, _simd_setzero_ps()));
+ }
+
+ /// @brief Returns a lane bitmask of primitives that have a NaN in any position component
+ /// of any vertex. The clipper is responsible for culling such primitives.
+ /// @param prim - NumVertsPerPrim SIMD vertex positions
+ int ComputeNaNMask(simdvector prim[])
+ {
+ simdscalar vUnordered = _simd_setzero_ps();
+ for (uint32_t vert = 0; vert < NumVertsPerPrim; ++vert)
+ {
+ const simdvector& vPos = prim[vert];
+
+ // an unordered compare is true when either operand is NaN, so two compares
+ // cover all four components
+ vUnordered = _simd_or_ps(vUnordered, _simd_cmp_ps(vPos.v[0], vPos.v[1], _CMP_UNORD_Q));
+ vUnordered = _simd_or_ps(vUnordered, _simd_cmp_ps(vPos.v[2], vPos.v[3], _CMP_UNORD_Q));
+ }
+
+ return _simd_movemask_ps(vUnordered);
+ }
+
+ /// @brief Returns a lane bitmask of primitives to discard because of user clip/cull distances.
+ /// A primitive is discarded when, for some enabled cull distance, every vertex has a
+ /// negative or NaN value, or when any enabled clip distance of any vertex is NaN.
+ /// @param pa - primitive assembly state the clip/cull distance slots are assembled from
+ /// @param prim - vertex positions (not read here)
+ int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
+ {
+ uint8_t pendingCull = this->state.rastState.cullDistanceMask;
+ simdscalar vDiscard = _simd_setzero_ps();
+ DWORD bit;
+
+ // distances 0-3 live in the LO slot, the remaining ones in the HI slot
+ simdvector vDistLo[3];
+ simdvector vDistHi[3];
+
+ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vDistLo);
+ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vDistHi);
+
+ // cull distances: discard only if all vertices are on the culled side
+ while (_BitScanForward(&bit, pendingCull))
+ {
+ pendingCull &= ~(1 << bit);
+ simdvector* pDist = ((bit >> 2) == 0) ? vDistLo : vDistHi;
+ const uint32_t comp = bit & 0x3;
+
+ simdscalar vAllCulled = _simd_set1_ps(-1.0f);
+ for (uint32_t vert = 0; vert < NumVertsPerPrim; ++vert)
+ {
+ // cull if cull distance < 0 || NAN
+ const simdscalar vCulled = _simd_cmp_ps(_mm256_setzero_ps(), pDist[vert][comp], _CMP_NLE_UQ);
+ vAllCulled = _simd_and_ps(vAllCulled, vCulled);
+ }
+ vDiscard = _simd_or_ps(vDiscard, vAllCulled);
+ }
+
+ // clip distances: a NaN on any vertex discards the primitive
+ uint8_t pendingClip = this->state.rastState.clipDistanceMask;
+ while (_BitScanForward(&bit, pendingClip))
+ {
+ pendingClip &= ~(1 << bit);
+ simdvector* pDist = ((bit >> 2) == 0) ? vDistLo : vDistHi;
+ const uint32_t comp = bit & 0x3;
+
+ for (uint32_t vert = 0; vert < NumVertsPerPrim; ++vert)
+ {
+ const simdscalar vDistance = pDist[vert][comp];
+ vDiscard = _simd_or_ps(vDiscard, _simd_cmp_ps(vDistance, vDistance, _CMP_UNORD_Q));
+ }
+ }
+
+ return _simd_movemask_ps(vDiscard);
+ }
+
+ /// @brief Clips a single triangle through the scalar Clip() routine.
+ /// @param pa - primitive assembly state to pull the triangle from
+ /// @param primIndex - index of the primitive within the PA's current SIMD batch
+ /// @param pOutPos - receives the clipped vertex positions
+ /// @param pOutAttribs - receives the clipped vertex attributes
+ /// @return number of vertices produced by Clip()
+ int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
+ {
+ OSALIGN(float, 16) inVerts[3 * 4];
+ OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
+
+ // pull the three positions out of the PA, 4 floats per vertex
+ __m128 vPos[3];
+ pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, vPos);
+ for (uint32_t vert = 0; vert < 3; ++vert)
+ {
+ _mm_store_ps(&inVerts[4 * vert], vPos[vert]);
+ }
+
+ // attributes are laid out per vertex: all of vertex 0's, then vertex 1's, then vertex 2's
+ uint32_t numScalarAttribs = this->state.linkageCount * 4;
+
+ int attribOffset = 0;
+ DWORD linkBit = 0;
+ uint32_t mapIdx = 0;
+ uint32_t pendingLinkage = uint32_t(this->state.linkageMask);
+ while (_BitScanForward(&linkBit, pendingLinkage))
+ {
+ pendingLinkage &= ~(1 << linkBit);
+
+ // Compute absolute attrib slot in vertex array
+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++];
+
+ __m128 vAttrib[3]; // triangle attribs (always 4 wide)
+ pa.AssembleSingle(inputSlot, primIndex, vAttrib);
+ for (uint32_t vert = 0; vert < 3; ++vert)
+ {
+ _mm_store_ps(&inAttribs[attribOffset + numScalarAttribs * vert], vAttrib[vert]);
+ }
+ attribOffset += 4;
+ }
+
+ int numOutVerts;
+ Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numOutVerts, pOutAttribs);
+
+ return numOutVerts;
+ }
+
+ /// @brief Clips a SIMD batch of primitives and bins the results.
+ /// Assembles position and every linked attribute from the PA into a local simdvertex
+ /// array, runs ClipPrims() on it, then for each input primitive that still has at least
+ /// NumVertsPerPrim vertices gathers that lane's vertices into SIMD order and feeds them
+ /// through a temporary PA_STATE_OPT to the bin function.
+ /// @param vPrimMask - lanes holding valid input primitives (clipped or not)
+ /// @param vClipMask - lanes whose primitive must actually be clipped
+ /// @param pa - primitive assembly state the input vertices are assembled from
+ /// @param vPrimId - per-lane primitive ids, forwarded to the bin function
+ void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
+ {
+ // input/output vertex store for clipper
+ simdvertex vertices[7]; // maximum 7 verts generated per triangle
+
+ LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
+ // provoking vertex defaults to vertex 0; only triangle fans read it from the frontend state
+ uint32_t provokingVertex = 0;
+ if(pa.binTopology == TOP_TRIANGLE_FAN)
+ {
+ provokingVertex = this->state.frontendState.provokingVertex.triFan;
+ }
+ ///@todo: line topology for wireframe?
+
+ // assemble pos
+ simdvector tmpVector[NumVertsPerPrim];
+ pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
+ }
+
+ // assemble attribs
+ DWORD slot = 0;
+ uint32_t mapIdx = 0;
+ uint32_t tmpLinkage = this->state.linkageMask;
+
+ // highest linkageMap entry seen; determines how many attribute slots get clipped
+ int32_t maxSlot = -1;
+ while (_BitScanForward(&slot, tmpLinkage))
+ {
+ tmpLinkage &= ~(1 << slot);
+ // Compute absolute attrib slot in vertex array
+ uint32_t mapSlot = this->state.linkageMap[mapIdx++];
+ maxSlot = std::max<int32_t>(maxSlot, mapSlot);
+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
+
+ pa.Assemble(inputSlot, tmpVector);
+
+ // if constant interpolation enabled for this attribute, assign the provoking
+ // vertex values to all edges
+ if (_bittest(&constantInterpMask, slot))
+ {
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
+ }
+ }
+ else
+ {
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[inputSlot] = tmpVector[i];
+ }
+ }
+ }
+
+ // attribute slots 0..maxSlot (relative to VERTEX_ATTRIB_START_SLOT) are clipped
+ uint32_t numAttribs = maxSlot + 1;
+
+ // clips in place: on return 'vertices' holds the clipped verts, and the result holds
+ // the per-lane vertex count
+ simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+
+ // set up new PA for binning clipped primitives
+ PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
+ PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
+ if (NumVertsPerPrim == 3)
+ {
+ pfnBinFunc = BinTriangles;
+ clipTopology = TOP_TRIANGLE_FAN;
+
+ // so that the binner knows to bloat wide points later
+ if (pa.binTopology == TOP_POINT_LIST)
+ clipTopology = TOP_POINT_LIST;
+ }
+ else if (NumVertsPerPrim == 2)
+ {
+ pfnBinFunc = BinLines;
+ clipTopology = TOP_LINE_LIST;
+ }
+ else
+ {
+ SWR_ASSERT(0 && "Unexpected points in clipper.");
+ }
+
+
+ // per-lane scalar views of the SIMD vertex counts and primitive ids
+ uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
+ uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
+
+ // byte offsets of vertices 0..6 within 'vertices'; gather lane N reads vertex N
+ const simdscalari vOffsets = _mm256_set_epi32(
+ 0 * sizeof(simdvertex), // unused lane
+ 6 * sizeof(simdvertex),
+ 5 * sizeof(simdvertex),
+ 4 * sizeof(simdvertex),
+ 3 * sizeof(simdvertex),
+ 2 * sizeof(simdvertex),
+ 1 * sizeof(simdvertex),
+ 0 * sizeof(simdvertex));
+
+ // only need to gather 7 verts
+ // @todo dynamic mask based on actual # of verts generated per lane
+ const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
+
+ uint32_t numClippedPrims = 0;
+ for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+ {
+ // lanes left with fewer verts than a full primitive produce nothing
+ uint32_t numEmittedVerts = pVertexCount[inputPrim];
+ if (numEmittedVerts < NumVertsPerPrim)
+ {
+ continue;
+ }
+ SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
+
+ uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
+ numClippedPrims += numEmittedPrims;
+
+ // transpose clipper output so that each lane's vertices are in SIMD order
+ // set aside space for 2 vertices, as the PA will try to read up to 16 verts
+ // for triangle fan
+ simdvertex transposedPrims[2];
+
+ // transpose pos
+ // pBase points at this primitive's lane within component 0 of vertex 0; each gather
+ // pulls that lane from up to 7 consecutive vertices, then pBase steps to the next component
+ uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+
+ // transpose attribs
+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+ }
+
+ // temporary PA over the transposed vertices, used to re-assemble them with clipTopology
+ PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+
+ while (clipPa.GetNextStreamOutput())
+ {
+ do
+ {
+ simdvector attrib[NumVertsPerPrim];
+ bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+ if (assemble)
+ {
+ // lane mask with the low numEmittedPrims bits set
+ static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
+ // every emitted primitive inherits the id of the input primitive it came from
+ pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
+ }
+ } while (clipPa.NextPrim());
+ }
+ }
+
+ // update global pipeline stat
+ SWR_CONTEXT* pContext = this->pDC->pContext;
+ UPDATE_STAT(CPrimitives, numClippedPrims);
+ }
+
+ /// @brief Executes the clipper stage for one SIMD batch of primitives.
+ /// Computes clip codes, discards primitives with NaN positions or rejected by user
+ /// clip/cull distances, then either runs the SIMD clipper (which bins its own output)
+ /// or forwards the primitives that are not trivially rejected straight to the binner.
+ /// @param pa - primitive assembly state for the batch
+ /// @param prim - NumVertsPerPrim SIMD vertex positions
+ /// @param primMask - bitmask of lanes holding valid primitives
+ /// @param primId - per-lane primitive ids
+ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
+ {
+ // set up binner based on PA state
+ PFN_PROCESS_PRIMS pfnBinner;
+ switch (pa.binTopology)
+ {
+ case TOP_POINT_LIST:
+ pfnBinner = BinPoints;
+ break;
+ case TOP_LINE_LIST:
+ case TOP_LINE_STRIP:
+ case TOP_LINE_LOOP:
+ case TOP_LINE_LIST_ADJ:
+ case TOP_LISTSTRIP_ADJ:
+ pfnBinner = BinLines;
+ break;
+ default:
+ pfnBinner = BinTriangles;
+ break;
+ }
+
+ // update clipper invocations pipeline stat
+ SWR_CONTEXT* pContext = this->pDC->pContext;
+ uint32_t numInvoc = _mm_popcnt_u32(primMask);
+ UPDATE_STAT(CInvocations, numInvoc);
+
+ ComputeClipCodes(prim);
+
+ // cull prims with NAN coords
+ primMask &= ~ComputeNaNMask(prim);
+
+ // user clip/cull distance discard. ComputeUserClipCullMask() handles both the cull
+ // distance test and the NaN clip distance test, so it has to run when either mask is
+ // set; gating on cullDistanceMask alone would skip the NaN clip distance discard
+ // whenever only clip distances are enabled.
+ if (this->state.rastState.cullDistanceMask || this->state.rastState.clipDistanceMask)
+ {
+ primMask &= ~ComputeUserClipCullMask(pa, prim);
+ }
+
+ // cull prims outside view frustum
+ // (a clip code shared by all vertices means the prim is entirely outside that plane)
+ simdscalar clipIntersection = ComputeClipCodeIntersection();
+ int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
+
+ // skip clipping for points
+ uint32_t clipMask = 0;
+ if (NumVertsPerPrim != 1)
+ {
+ clipMask = primMask & ComputeClipMask();
+ }
+
+ if (clipMask)
+ {
+ RDTSC_START(FEGuardbandClip);
+ // we have to clip tris, execute the clipper, which will also
+ // call the binner
+ ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
+ RDTSC_STOP(FEGuardbandClip, 1, 0);
+ }
+ else if (validMask)
+ {
+ // update CPrimitives pipeline state
+ UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
+
+ // forward valid prims directly to binner
+ pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
+ }
+ }
+
+private:
+ /// @brief Returns boundaryCoord0 / (boundaryCoord0 - boundaryCoord1) per lane, the
+ /// interpolation factor used by intersect() to place the new vertex on an edge.
+ inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
+ {
+ const simdscalar vSpan = _simd_sub_ps(boundaryCoord0, boundaryCoord1);
+ return _simd_div_ps(boundaryCoord0, vSpan);
+ }
+
+ /// @brief Computes, per lane, the byte offset of one attribute component within an
+ /// array of simdvertex.
+ /// @param attrib - attribute slot within the simdvertex
+ /// @param vIndices - per-lane index of the simdvertex to address
+ /// @param component - component (0-3) of the attribute
+ /// @return per-lane byte offsets: vertex * sizeof(simdvertex) + attrib * sizeof(simdvector)
+ /// + component * sizeof(simdscalar) + lane * sizeof(float)
+ inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
+ {
+ // offset of the requested component inside any one simdvertex
+ const uint32_t offsetInVertex = uint32_t(sizeof(simdvector)) * attrib + uint32_t(sizeof(simdscalar)) * component;
+
+ // each lane addresses its own float within the component
+ const __m256i vLaneOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
+ 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
+
+ const simdscalari vVertexBase = _simd_mullo_epi32(vIndices, _simd_set1_epi32(uint32_t(sizeof(simdvertex))));
+ const simdscalari vComponentBase = _simd_add_epi32(vVertexBase, _simd_set1_epi32(offsetInVertex));
+
+ return _simd_add_epi32(vComponentBase, vLaneOffset);
+ }
+
+ /// @brief Gathers one component of one attribute for each active lane, where every lane
+ /// reads from the simdvertex selected by its entry in vIndices.
+ /// @param pBuffer - base of the simdvertex array
+ /// @param vMask - lanes to gather; other lanes are left undefined
+ inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
+ {
+ const simdscalari vByteOffsets = ComputeOffsets(attrib, vIndices, component);
+ return _simd_mask_i32gather_ps(_mm256_undefined_ps(), pBuffer, vByteOffsets, vMask, 1);
+ }
+
+ /// @brief Stores one component of one attribute for each active lane, where every lane
+ /// writes into the simdvertex selected by its entry in vIndices.
+ /// Note that pBuffer is written through despite being declared const float*.
+ /// @param pBuffer - base of the simdvertex array
+ /// @param vMask - lanes to store
+ /// @param vSrc - per-lane values to store
+ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
+ {
+ const simdscalari vByteOffsets = ComputeOffsets(attrib, vIndices, component);
+
+ const uint32_t* pLaneOffset = (const uint32_t*)&vByteOffsets;
+ const float* pLaneValue = (const float*)&vSrc;
+ uint8_t* pDstBase = (uint8_t*)pBuffer;
+
+ // walk the set bits of the lane mask from lowest to highest, one scalar store per lane
+ uint32_t pendingLanes = _simd_movemask_ps(vMask);
+ DWORD lane;
+ while (_BitScanForward(&lane, pendingLanes))
+ {
+ *(float*)(pDstBase + pLaneOffset[lane]) = pLaneValue[lane];
+ pendingLanes &= ~(1 << lane);
+ }
+ }
+
+ /// @brief Computes the intersection of an edge with a clipping plane for the active lanes
+ /// and writes the new vertex (position and attributes) to pOutVerts at outIndex.
+ /// The new vertex is v1 + t * (v2 - v1), where t = d1 / (d1 - d2) and d1/d2 are the
+ /// plane's boundary coordinates of the two edge vertices (e.g. w + x for FRUSTUM_LEFT).
+ /// outIndex is not advanced here; the caller increments it.
+ template<SWR_CLIPCODES ClippingPlane>
+ inline void intersect(
+ const simdscalar& vActiveMask, // active lanes to operate on
+ const simdscalari& s, // per-lane index of the first edge vertex in pInVerts.
+ const simdscalari& p, // per-lane index of the second edge vertex in pInVerts.
+ const simdvector& v1, // position of the first edge vertex (index s)
+ const simdvector& v2, // position of the second edge vertex (index p)
+ simdscalari& outIndex, // per-lane index in pOutVerts to write the new vertex to.
+ const float *pInVerts, // simdvertex array holding the input vertices.
+ uint32_t numInAttribs, // number of attribute slots (excluding position) to interpolate.
+ float *pOutVerts) // simdvertex array receiving the intersection vertex.
+ {
+ // compute interpolation factor
+ // NOTE(review): t is left uninitialized if the default case is reached and SWR_ASSERT
+ // does not halt; every visible instantiation uses one of the six frustum planes.
+ simdscalar t;
+ switch (ClippingPlane)
+ {
+ case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
+ case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
+ case FRUSTUM_NEAR:
+ // DX Znear plane is 0, GL is -w
+ if (this->driverType == DX)
+ {
+ t = ComputeInterpFactor(v1[2], v2[2]);
+ }
+ else
+ {
+ t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
+ }
+ break;
+ case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
+ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
+ };
+
+ // interpolate position and store
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
+ }
+
+ // interpolate attributes and store
+ // (positions were passed in by the caller, attributes are gathered from pInVerts here)
+ for (uint32_t a = 0; a < numInAttribs; ++a)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+ }
+ }
+ }
+
+ /// @brief Tests, per lane, whether a vertex lies on the inside of a clipping plane.
+ /// A vertex exactly on the plane counts as inside.
+ /// @param v - clip space position (x, y, z, w)
+ /// @return all-ones lanes where the vertex is inside, zero lanes elsewhere
+ template<SWR_CLIPCODES ClippingPlane>
+ inline simdscalar inside(const simdvector& v)
+ {
+ // lower bound shared by the left, top and non-DX near planes
+ const simdscalar vMinusW = _simd_mul_ps(v[3], _simd_set1_ps(-1.0f));
+
+ simdscalar vInside = _simd_setzero_ps();
+ switch (ClippingPlane)
+ {
+ case FRUSTUM_LEFT:
+ vInside = _simd_cmpge_ps(v[0], vMinusW);
+ break;
+ case FRUSTUM_RIGHT:
+ vInside = _simd_cmple_ps(v[0], v[3]);
+ break;
+ case FRUSTUM_TOP:
+ vInside = _simd_cmpge_ps(v[1], vMinusW);
+ break;
+ case FRUSTUM_BOTTOM:
+ vInside = _simd_cmple_ps(v[1], v[3]);
+ break;
+ case FRUSTUM_NEAR:
+ // DX near plane is z = 0, otherwise z = -w
+ vInside = _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : vMinusW);
+ break;
+ case FRUSTUM_FAR:
+ vInside = _simd_cmple_ps(v[2], v[3]);
+ break;
+ default:
+ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
+ break;
+ }
+ return vInside;
+ }
+
+ /// @brief Clips each lane's polygon against one plane.
+ /// Walks the input vertices in order; for every edge (s, p), with p wrapping back to
+ /// vertex 0 after the last vertex, it emits s if s is inside the plane and then emits the
+ /// edge/plane intersection if exactly one of s and p is inside.
+ /// @param pInVerts - simdvertex array holding the input polygon of every lane
+ /// @param vNumInPts - per-lane number of input vertices (0 disables a lane)
+ /// @param numInAttribs - number of attribute slots (excluding position) to carry along
+ /// @param pOutVerts - simdvertex array receiving the clipped polygon of every lane
+ /// @return per-lane number of vertices written to pOutVerts
+ template<SWR_CLIPCODES ClippingPlane>
+ simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ {
+ simdscalari vCurIndex = _simd_setzero_si();
+ simdscalari vOutIndex = _simd_setzero_si();
+ // a lane stays active while it still has input vertices left to visit
+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+
+ while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
+ {
+ simdscalari s = vCurIndex;
+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
+ // wrap p to vertex 0 in lanes where s is the last input vertex
+ simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
+ p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
+
+ // gather position
+ simdvector vInPos0, vInPos1;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
+ }
+
+ // compute inside mask
+ simdscalar s_in = inside<ClippingPlane>(vInPos0);
+ simdscalar p_in = inside<ClippingPlane>(vInPos1);
+
+ // compute intersection mask (s_in != p_in)
+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
+ intersectMask = _simd_and_ps(intersectMask, vActiveMask);
+
+ // store s if inside
+ s_in = _simd_and_ps(s_in, vActiveMask);
+ if (!_simd_testz_ps(s_in, s_in))
+ {
+ // store position
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+ }
+
+ // store attribs
+ for (uint32_t a = 0; a < numInAttribs; ++a)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
+ // increment outIndex
+ // (only in the lanes that just stored a vertex)
+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
+ }
+
+ // compute and store intersection
+ if (!_simd_testz_ps(intersectMask, intersectMask))
+ {
+ intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+
+ // increment outIndex for active lanes
+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
+ }
+
+ // increment loop index and update active mask
+ vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
+ vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+ }
+
+ return vOutIndex;
+ }
+
+ /// @brief Clips each lane's line segment against one plane.
+ /// Handles the single segment formed by input vertices 0 and 1: emits vertex 0 if it is
+ /// inside, then the segment/plane intersection if exactly one endpoint is inside, then
+ /// vertex 1 if it is inside.
+ /// @param pInVerts - simdvertex array holding the input segment of every lane
+ /// @param vNumInPts - per-lane number of input vertices (0 disables a lane)
+ /// @param numInAttribs - number of attribute slots (excluding position) to carry along
+ /// @param pOutVerts - simdvertex array receiving the clipped segment of every lane
+ /// @return per-lane number of vertices written to pOutVerts
+ template<SWR_CLIPCODES ClippingPlane>
+ simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ {
+ simdscalari vCurIndex = _simd_setzero_si();
+ simdscalari vOutIndex = _simd_setzero_si();
+ // lanes with at least one input vertex
+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+
+ if (!_simd_testz_ps(vActiveMask, vActiveMask))
+ {
+ // s is vertex 0 and p is vertex 1 in every lane
+ simdscalari s = vCurIndex;
+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
+
+ // gather position
+ simdvector vInPos0, vInPos1;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
+ }
+
+ // compute inside mask
+ simdscalar s_in = inside<ClippingPlane>(vInPos0);
+ simdscalar p_in = inside<ClippingPlane>(vInPos1);
+
+ // compute intersection mask (s_in != p_in)
+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
+ intersectMask = _simd_and_ps(intersectMask, vActiveMask);
+
+ // store s if inside
+ s_in = _simd_and_ps(s_in, vActiveMask);
+ if (!_simd_testz_ps(s_in, s_in))
+ {
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+ }
+
+ // copy s's attributes unchanged
+ for (uint32_t a = 0; a < numInAttribs; ++a)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
+ // increment outIndex
+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
+ }
+
+ // compute and store intersection
+ if (!_simd_testz_ps(intersectMask, intersectMask))
+ {
+ intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+
+ // increment outIndex for active lanes
+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
+ }
+
+ // store p if inside
+ p_in = _simd_and_ps(p_in, vActiveMask);
+ if (!_simd_testz_ps(p_in, p_in))
+ {
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
+ }
+
+ // copy p's attributes unchanged
+ for (uint32_t a = 0; a < numInAttribs; ++a)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
+ ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
+ }
+ }
+
+ // increment outIndex
+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
+ }
+ }
+
+ return vOutIndex;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Vertical clipper. Clips SIMD primitives at a time
+ /// Clips against the near, far, left, right, bottom and top frustum planes in that order,
+ /// ping-ponging between pVertices and a local temp buffer; after the sixth plane the
+ /// result is back in pVertices.
+ /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
+ /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
+ /// @param vClipMask - mask of primitives that need clipping; other lanes are left untouched
+ /// @param numAttribs - number of attribute slots, starting at VERTEX_ATTRIB_START_SLOT, to clip.
+ /// Position is always clipped in addition to these.
+ /// @return per-lane vertex count: the clipped count for clipped lanes, NumVertsPerPrim for
+ /// valid non-clipped lanes
+ simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
+ {
+ // temp storage
+ simdvertex tempVertices[7];
+ float* pTempVerts = (float*)&tempVertices[0];
+
+ // zero out num input verts for non-active lanes
+ // (a zero count keeps the per-plane clippers from reading or writing that lane)
+ simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
+ vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
+
+ // clip prims to frustum
+ simdscalari vNumOutPts;
+ if (NumVertsPerPrim == 3)
+ {
+ vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ }
+ else
+ {
+ SWR_ASSERT(NumVertsPerPrim == 2);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ }
+
+ // restore num verts for non-clipped, active lanes
+ // (their vertices in pVertices were never modified, so they pass through as-is)
+ simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
+ vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
+
+ return vNumOutPts;
+ }
+
+ const uint32_t workerId; // worker id, forwarded to the bin functions
+ const DRIVER_TYPE driverType; // selects the near plane convention (DX: z >= 0, otherwise z >= -w)
+ DRAW_CONTEXT* pDC; // draw context this clipper operates on
+ const API_STATE& state; // API state obtained from GetApiState(pDC)
+ simdscalar clipCodes[NumVertsPerPrim]; // per-vertex clip codes, filled by ComputeClipCodes()
+};
+
+
+// pipeline stage functions
+// Each one builds a Clipper for its primitive type (3, 2 or 1 vertices per primitive) and
+// runs Clipper::ExecuteStage() on the given SIMD batch of primitives.
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
new file mode 100644
index 00000000000..4a214aff1c8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -0,0 +1,495 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file context.h
+*
+* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
+* The SWR_CONTEXT is our global context and contains the DC ring,
+* thread state, etc.
+*
+* The DRAW_CONTEXT contains all state associated with a draw operation.
+*
+******************************************************************************/
+#pragma once
+
+#include <condition_variable>
+#include <algorithm>
+
+#include "core/api.h"
+#include "core/utils.h"
+#include "core/arena.h"
+#include "core/fifo.hpp"
+#include "core/knobs.h"
+#include "common/simdintrin.h"
+#include "core/threads.h"
+
+// x.8 fixed point precision values
+#define FIXED_POINT_SHIFT 8
+#define FIXED_POINT_SCALE 256
+
+// x.16 fixed point precision values
+#define FIXED_POINT16_SHIFT 16
+#define FIXED_POINT16_SCALE 65536
+
+struct SWR_CONTEXT;
+struct DRAW_CONTEXT;
+
+struct TRI_FLAGS // Per-primitive flags/IDs; embedded as triFlags in SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
+{
+ uint32_t frontFacing : 1;
+ uint32_t yMajor : 1;
+ uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM bits wide
+ uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // pads the bitfields above to 32 bits in total
+ float pointSize;
+ uint32_t primID;
+ uint32_t renderTargetArrayIndex;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TRIANGLE_DESC
+/////////////////////////////////////////////////////////////////////////
+struct SWR_TRIANGLE_DESC
+{
+ float I[3];
+ float J[3];
+ float Z[3];
+ float OneOverW[3];
+ float recipDet;
+
+ float *pRecipW;
+ float *pAttribs;
+ float *pPerspAttribs;
+ float *pSamplePos;
+ float *pUserClipBuffer;
+
+ uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+
+ TRI_FLAGS triFlags;
+};
+
+struct TRIANGLE_WORK_DESC
+{
+ float *pTriBuffer;
+ float *pAttribs;
+ float *pUserClipBuffer;
+ uint32_t numAttribs;
+ TRI_FLAGS triFlags;
+};
+
+union CLEAR_FLAGS
+{
+ struct
+ {
+ uint32_t mask : 3;
+ };
+ uint32_t bits;
+};
+
+struct CLEAR_DESC
+{
+ CLEAR_FLAGS flags;
+ float clearRTColor[4]; // RGBA_32F
+ float clearDepth; // [0..1]
+ BYTE clearStencil;
+};
+
+struct INVALIDATE_TILES_DESC
+{
+ uint32_t attachmentMask;
+};
+
+struct SYNC_DESC
+{
+ PFN_CALLBACK_FUNC pfnCallbackFunc;
+ uint64_t userData;
+ uint64_t userData2;
+ uint64_t userData3;
+};
+
+struct QUERY_DESC
+{
+ SWR_STATS* pStats;
+};
+
+struct STORE_TILES_DESC
+{
+ SWR_RENDERTARGET_ATTACHMENT attachment;
+ SWR_TILE_STATE postStoreTileState;
+};
+
+struct COMPUTE_DESC
+{
+ uint32_t threadGroupCountX;
+ uint32_t threadGroupCountY;
+ uint32_t threadGroupCountZ;
+};
+
+typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
+
+enum WORK_TYPE
+{
+ SYNC,
+ DRAW,
+ CLEAR,
+ INVALIDATETILES,
+ STORETILES,
+ QUERYSTATS,
+};
+
+struct BE_WORK
+{
+ WORK_TYPE type;
+ PFN_WORK_FUNC pfnWork;
+ union
+ {
+ SYNC_DESC sync;
+ TRIANGLE_WORK_DESC tri;
+ CLEAR_DESC clear;
+ INVALIDATE_TILES_DESC invalidateTiles;
+ STORE_TILES_DESC storeTiles;
+ QUERY_DESC queryStats;
+ } desc;
+};
+
+struct DRAW_WORK
+{
+ DRAW_CONTEXT* pDC;
+ union
+ {
+ uint32_t numIndices; // DrawIndexed: Number of indices for draw.
+ uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
+ };
+ union
+ {
+ const int32_t* pIB; // DrawIndexed: App supplied indices
+ uint32_t startVertex; // Draw: Starting vertex in VB to render from.
+ };
+ int32_t baseVertex;
+ uint32_t numInstances; // Number of instances
+ uint32_t startInstance; // Instance offset
+ uint32_t startPrimID; // starting primitiveID for this draw batch
+ uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
+ SWR_FORMAT type; // index buffer type
+};
+
+typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
+struct FE_WORK
+{
+ WORK_TYPE type;
+ PFN_FE_WORK_FUNC pfnWork;
+ union
+ {
+ SYNC_DESC sync;
+ DRAW_WORK draw;
+ CLEAR_DESC clear;
+ INVALIDATE_TILES_DESC invalidateTiles;
+ STORE_TILES_DESC storeTiles;
+ QUERY_DESC queryStats;
+ } desc;
+};
+
+struct GUARDBAND
+{
+ float left, right, top, bottom;
+};
+
+struct PA_STATE;
+
+// function signature for pipeline stages that execute after primitive assembly
+typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
+ uint32_t primMask, simdscalari primID);
+
+OSALIGNLINE(struct) API_STATE
+{
+ // Vertex Buffers
+ SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
+
+ // Index Buffer
+ SWR_INDEX_BUFFER_STATE indexBuffer;
+
+ // FS - Fetch Shader State
+ PFN_FETCH_FUNC pfnFetchFunc;
+
+ // VS - Vertex Shader State
+ PFN_VERTEX_FUNC pfnVertexFunc;
+
+ // GS - Geometry Shader State
+ PFN_GS_FUNC pfnGsFunc;
+ SWR_GS_STATE gsState;
+
+ // CS - Compute Shader
+ PFN_CS_FUNC pfnCsFunc;
+ uint32_t totalThreadsInGroup;
+
+ // FE - Frontend State
+ SWR_FRONTEND_STATE frontendState;
+
+ // SOS - Streamout Shader State
+ PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
+
+ // Streamout state
+ SWR_STREAMOUT_STATE soState;
+ mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
+
+ // Tessellation State
+ PFN_HS_FUNC pfnHsFunc;
+ PFN_DS_FUNC pfnDsFunc;
+ SWR_TS_STATE tsState;
+
+ // Specifies which VS outputs are sent to PS.
+ // Does not include position
+ uint32_t linkageMask;
+ uint32_t linkageCount;
+ uint8_t linkageMap[MAX_ATTRIBUTES];
+
+ // attrib mask, specifies the total set of attributes used
+ // by the frontend (vs, so, gs)
+ uint32_t feAttribMask;
+
+ PRIMITIVE_TOPOLOGY topology;
+ bool forceFront;
+
+ // RS - Rasterizer State
+ SWR_RASTSTATE rastState;
+ // floating point multisample offsets
+ float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
+
+ GUARDBAND gbState;
+
+ SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
+ SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS];
+
+ BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
+ BBOX scissorInFixedPoint;
+
+ // Backend state
+ SWR_BACKEND_STATE backendState;
+
+ // PS - Pixel shader state
+ SWR_PS_STATE psState;
+
+ SWR_DEPTH_STENCIL_STATE depthStencilState;
+
+ // OM - Output Merger State
+ SWR_BLEND_STATE blendState;
+ PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
+
+ // Stats are incremented when this is true.
+ bool enableStats;
+
+ struct
+ {
+ uint32_t colorHottileEnable : 8;
+ uint32_t depthHottileEnable: 1;
+ uint32_t stencilHottileEnable : 1;
+ };
+};
+
+class MacroTileMgr;
+class DispatchQueue;
+
+struct RenderOutputBuffers
+{
+ uint8_t* pColor[SWR_NUM_RENDERTARGETS];
+ uint8_t* pDepth;
+ uint8_t* pStencil;
+};
+
+// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
+struct BarycentricCoeffs
+{
+ simdscalar vIa;
+ simdscalar vIb;
+ simdscalar vIc;
+
+ simdscalar vJa;
+ simdscalar vJb;
+ simdscalar vJc;
+
+ simdscalar vZa;
+ simdscalar vZb;
+ simdscalar vZc;
+
+ simdscalar vRecipDet;
+
+ simdscalar vAOneOverW;
+ simdscalar vBOneOverW;
+ simdscalar vCOneOverW;
+};
+
+// pipeline function pointer types
+typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
+typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
+ const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
+typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
+typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
+typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
+ const simdscalar, const simdscalar);
+
+struct BACKEND_FUNCS
+{
+ PFN_BACKEND_FUNC pfnBackend;
+ PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
+ PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
+ PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
+ PFN_OUTPUT_MERGER pfnOutputMerger;
+};
+
+// Draw State
+struct DRAW_STATE // Reached from a DRAW_CONTEXT through its pState pointer.
+{
+ API_STATE state; // returned by GetApiState()
+
+ void* pPrivateState; // The driver is required to set this up for each draw; returned by GetPrivateState().
+
+ // Pipeline function pointers, filled in by the API thread when setting up the draw.
+ BACKEND_FUNCS backendFuncs;
+ PFN_PROCESS_PRIMS pfnProcessPrims;
+
+ Arena* pArena; // This should only be used by the API thread.
+};
+
+// Draw Context
+// The api thread sets up a draw context that exists for the life of the draw.
+// This draw context maintains all of the state needed for the draw operation.
+struct DRAW_CONTEXT // One entry of the draw-context ring (SWR_CONTEXT::dcRing).
+{
+ SWR_CONTEXT *pContext;
+
+ uint64_t drawId; // unique per draw; see SWR_CONTEXT::nextDrawId
+
+ bool isCompute; // Is this DC a compute context?
+
+ FE_WORK FeWork;
+ volatile OSALIGNLINE(uint32_t) FeLock;
+ volatile OSALIGNLINE(bool) inUse;
+ volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
+
+ // Have all worker threads moved past this draw in the DC ring?
+ volatile OSALIGNLINE(uint32_t) threadsDoneFE;
+ volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+
+ uint64_t dependency; // NOTE(review): looks like a draw id this DC must wait on (cf. WaitForDependencies) - confirm
+
+ MacroTileMgr* pTileMgr;
+
+ // The following fields are valid if isCompute is true.
+ volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+
+ DRAW_STATE* pState; // several DCs may reference one DRAW_STATE (see the Draw State Ring notes in SWR_CONTEXT)
+ Arena* pArena;
+
+ uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
+};
+
+INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) // Returns the API_STATE held by the draw's DRAW_STATE.
+{
+ SWR_ASSERT(pDC != nullptr); // pDC and pDC->pState are only checked through SWR_ASSERT before being dereferenced
+ SWR_ASSERT(pDC->pState != nullptr);
+
+ return pDC->pState->state;
+}
+
+INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) // Returns the driver-private state pointer held by the draw's DRAW_STATE.
+{
+ SWR_ASSERT(pDC != nullptr); // pDC and pDC->pState are only checked through SWR_ASSERT before being dereferenced
+ SWR_ASSERT(pDC->pState != nullptr);
+
+ return pDC->pState->pPrivateState;
+}
+
+class HotTileMgr;
+
+struct SWR_CONTEXT
+{
+ // Draw Context Ring
+ // Each draw needs its own state in order to support multiple draws in flight across multiple threads.
+ // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
+ // of draws that can be in flight at any given time.
+ //
+ // Description:
+ // 1. State - When an application first sets state we'll request a new draw context to use.
+ // a. If there are no available draw contexts then we'll have to wait until one becomes free.
+ // b. If one is available then set pCurDrawContext to point to it and mark it in use.
+ // c. All state calls set state on pCurDrawContext.
+ // 2. Draw - Creates and submits a work item that is associated with the current draw context.
+ // a. Set pPrevDrawContext = pCurDrawContext
+ // b. Set pCurDrawContext to NULL.
+ // 3. State - When an application sets state after a draw
+ // a. Same as step 1.
+ // b. State is copied from prev draw context to current.
+ DRAW_CONTEXT* dcRing;
+
+ DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
+ DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
+
+ // Draw State Ring
+ // When draws are very large (lots of primitives) the API thread will break them up.
+ // These split draws all have identical state. So instead of storing the state directly
+ // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
+ // to reference a single entry in the DS ring.
+ DRAW_STATE* dsRing;
+
+ uint32_t curStateId; // Current index to the next available entry in the DS ring.
+
+ DRAW_STATE* subCtxSave; // Save area for inactive contexts.
+ uint32_t curSubCtxId; // Current index for active state subcontext.
+ uint32_t numSubContexts; // Number of available subcontexts
+
+ uint32_t NumWorkerThreads;
+
+ THREAD_POOL threadPool; // Thread pool associated with this context
+
+ std::condition_variable FifosNotEmpty;
+ std::mutex WaitLock;
+
+ // Draw Contexts will get a unique drawId generated from this
+ uint64_t nextDrawId;
+
+ // most recent draw id enqueued by the API thread
+ // written by api thread, read by multiple workers
+ OSALIGNLINE(volatile uint64_t) DrawEnqueued;
+
+ DRIVER_TYPE driverType;
+
+ uint32_t privateStateSize;
+
+ HotTileMgr *pHotTileMgr;
+
+ // tile load/store functions, passed in at create context time
+ PFN_LOAD_TILE pfnLoadTile;
+ PFN_STORE_TILE pfnStoreTile;
+ PFN_CLEAR_TILE pfnClearTile;
+
+ // Global Stats
+ SWR_STATS stats[KNOB_MAX_NUM_THREADS]; // indexed by workerId in UPDATE_STAT / SET_STAT
+
+ // Scratch space for workers.
+ uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+};
+
+void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
+void WakeAllThreads(SWR_CONTEXT *pContext);
+
+#define UPDATE_STAT(name, count) do { if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += (count); } } while (0) // stats-gated add; needs pDC, pContext, workerId in scope; do/while(0) keeps it one statement (safe before else)
+#define SET_STAT(name, count) do { if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = (count); } } while (0) // stats-gated overwrite; needs pDC, pContext, workerId in scope; do/while(0) keeps it one statement (safe before else)
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
new file mode 100644
index 00000000000..4f245c8c53e
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -0,0 +1,245 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file depthstencil.h
+*
+* @brief Implements depth/stencil functionality
+*
+******************************************************************************/
+#pragma once
+#include "common/os.h"
+#include "format_conversion.h"
+
+INLINE // Applies stencil operation 'op' to stencilps in place, only in the lanes selected by 'mask'.
+void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps)
+{
+ simdscalari stencil = _simd_castps_si(stencilps); // integer view of the incoming stencil bits, used by the arithmetic ops
+
+ switch (op)
+ {
+ case STENCILOP_KEEP:
+ break;
+ case STENCILOP_ZERO:
+ stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
+ break;
+ case STENCILOP_REPLACE:
+ stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
+ break;
+ case STENCILOP_INCRSAT:
+ {
+ simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); // addend is 1 only in the low byte of each 32-bit lane; presumably a saturating unsigned 8-bit add - confirm in simdintrin.h
+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
+ break;
+ }
+ case STENCILOP_DECRSAT:
+ {
+ simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); // subtrahend is 1 only in the low byte of each 32-bit lane; presumably a saturating unsigned 8-bit subtract - confirm
+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
+ break;
+ }
+ case STENCILOP_INCR:
+ {
+ simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); // non-saturating 8-bit add of 1 to the low byte of each lane
+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
+ break;
+ }
+ case STENCILOP_DECR:
+ {
+ simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); // adds 0xff to the low byte of each lane, i.e. 8-bit decrement with wrap-around
+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
+ break;
+ }
+ case STENCILOP_INVERT:
+ {
+ simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); // (~stencil) & all-ones: the 0 == 0 compare builds the all-ones operand
+ stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
+ break;
+ }
+ default: // unrecognized ops leave stencilps untouched
+ break;
+ }
+}
+
+
+INLINE // Runs the depth and stencil tests; returns depthResult & stencilMask & coverageMask and writes the stencil-test result to *pStencilMask.
+simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
+ bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
+ simdscalar* pStencilMask)
+{
+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+ static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
+
+ simdscalar depthResult = _simd_set1_ps(-1.0f); // default "pass": sign bit set in every lane (0xBF800000, not all-ones); NOTE(review): consumers presumably key off the sign bit - confirm
+ simdscalar zbuf; // only loaded (and only read) for the comparison functions, see the two switches below
+
+ // clamp Z to viewport [minZ..maxZ]
+ simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
+ simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
+ interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+
+ if (pDSState->depthTestEnable)
+ {
+ switch (pDSState->depthTestFunc) // first pass: NEVER/ALWAYS need no depth buffer read, everything else loads it
+ {
+ case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break;
+ case ZFUNC_ALWAYS: break;
+ default:
+ zbuf = _simd_load_ps((const float*)pDepthBase);
+ }
+
+ switch (pDSState->depthTestFunc) // second pass: compare interpolated Z (left operand) against the stored Z
+ {
+ case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break;
+ case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break;
+ case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break;
+ case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break;
+ case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break;
+ case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break;
+ }
+ }
+
+ simdscalar stencilMask = _simd_set1_ps(-1.0f); // default "pass" when the stencil test is disabled or ALWAYS
+
+ if (pDSState->stencilTestEnable)
+ {
+ uint8_t stencilRefValue;
+ uint32_t stencilTestFunc;
+ uint8_t stencilTestMask;
+ if (frontFacing || !pDSState->doubleSidedStencilTestEnable) // back-face state is used only for back faces with double-sided stencil enabled
+ {
+ stencilRefValue = pDSState->stencilRefValue;
+ stencilTestFunc = pDSState->stencilTestFunc;
+ stencilTestMask = pDSState->stencilTestMask;
+ }
+ else
+ {
+ stencilRefValue = pDSState->backfaceStencilRefValue;
+ stencilTestFunc = pDSState->backfaceStencilTestFunc;
+ stencilTestMask = pDSState->backfaceStencilTestMask;
+ }
+
+ simdvector sbuf;
+ simdscalar stencilWithMask; // set only in the default case below, read only by the comparison cases
+ simdscalar stencilRef; // set only in the default case below, read only by the comparison cases
+ switch(stencilTestFunc) // the stencil function is compared against the same ZFUNC_* values as the depth function
+ {
+ case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break;
+ case ZFUNC_ALWAYS: break;
+ default:
+ LoadSOA<R8_UINT>(pStencilBase, sbuf);
+
+ // apply stencil read mask
+ stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
+
+ // do stencil compare in float to avoid simd integer emulation in AVX1
+ stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
+
+ stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); // the reference value gets the same read mask
+ break;
+ }
+
+ switch(stencilTestFunc) // operand order is (masked reference, masked stencil buffer value)
+ {
+ case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break;
+ case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break;
+ case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break;
+ case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break;
+ case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break;
+ case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break;
+ }
+ }
+
+ simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
+ depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask);
+
+ *pStencilMask = stencilMask; // stencil result is returned without coverage applied
+ return depthWriteMask;
+}
+
+INLINE // Writes depth (if depthWriteEnable) and updated stencil (if stencilWriteEnable) for the lanes selected by the masks.
+void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
+ bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
+ BYTE *pStencilBase, const simdscalar& stencilMask)
+{
+ if (pDSState->depthWriteEnable)
+ {
+ // clamp Z to viewport [minZ..maxZ]
+ simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
+ simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
+ interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+
+ simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
+ _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ); // masked store: only lanes selected by vMask are written
+ }
+
+ if (pDSState->stencilWriteEnable)
+ {
+ simdvector sbuf;
+ LoadSOA<R8_UINT>(pStencilBase, sbuf);
+ simdscalar stencilbuf = sbuf.v[0];
+
+ uint8_t stencilRefValue;
+ uint32_t stencilFailOp;
+ uint32_t stencilPassDepthPassOp;
+ uint32_t stencilPassDepthFailOp;
+ uint8_t stencilWriteMask;
+ if (frontFacing || !pDSState->doubleSidedStencilTestEnable) // same front/back selection rule as DepthStencilTest
+ {
+ stencilRefValue = pDSState->stencilRefValue;
+ stencilFailOp = pDSState->stencilFailOp;
+ stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
+ stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
+ stencilWriteMask = pDSState->stencilWriteMask;
+ }
+ else
+ {
+ stencilRefValue = pDSState->backfaceStencilRefValue;
+ stencilFailOp = pDSState->backfaceStencilFailOp;
+ stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
+ stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
+ stencilWriteMask = pDSState->backfaceStencilWriteMask;
+ }
+
+ simdscalar stencilps = stencilbuf;
+ simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); // reference value kept as integer bits in a float register
+
+ simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask); // covered lanes that failed the stencil test
+ simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
+ simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); // stencil pass AND NOT depth pass; coverage is applied by the final blend below
+
+ simdscalar origStencil = stencilps;
+
+ StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); // the three ops are applied in sequence to the same stencilps
+ StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps);
+ StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps);
+
+ // apply stencil write mask
+ simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
+ stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); // keep new bits where the write mask is set
+ stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); // keep original bits where it is clear
+
+ simdvector stencilResult;
+ stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask); // uncovered lanes keep their original stencil value
+ StoreSOA<R8_UINT>(stencilResult, pStencilBase);
+ }
+
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
new file mode 100644
index 00000000000..7e556012e6b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -0,0 +1,136 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file fifo.hpp
+*
+* @brief Definitions for our fifos used for thread communication.
+*
+******************************************************************************/
+#pragma once
+
+
+#include "common/os.h"
+#include "arena.h"
+
+#include <vector>
+#include <cassert>
+
+template<class T>
+struct QUEUE // Block-based FIFO of T: entries live in arena-allocated blocks of mBlockSize elements; mLock is a try-lock flag.
+{
+ OSALIGNLINE(volatile uint32_t) mLock{ 0 }; // 0 = unlocked, 1 = locked (see tryLock / unlock)
+ OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 }; // entries enqueued and not yet dequeued
+ std::vector<T*> mBlocks; // pointers to the arena-allocated blocks
+ T* mCurBlock{ nullptr }; // block currently being written by enqueue
+ uint32_t mHead{ 0 }; // read position: a global entry index (block = mHead >> mBlockSizeShift)
+ uint32_t mTail{ 0 }; // write position: an index inside mCurBlock only
+ uint32_t mCurBlockIdx{ 0 }; // index of mCurBlock within mBlocks
+
+ // power of 2
+ static const uint32_t mBlockSizeShift = 6;
+ static const uint32_t mBlockSize = 1 << mBlockSizeShift; // 64 entries per block
+
+ void clear(Arena& arena) // Resets the queue to a single freshly allocated block and releases the lock.
+ {
+ mHead = 0;
+ mTail = 0;
+ mBlocks.clear(); // drops the block pointers only; nothing is freed here
+ T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); // NOTE(review): result is not checked here, unlike in enqueue_try_nosync
+ mBlocks.push_back(pNewBlock);
+ mCurBlock = pNewBlock;
+ mCurBlockIdx = 0;
+
+ mNumEntries = 0;
+ _ReadWriteBarrier(); // keep the compiler from moving the resets above past the unlock below
+ mLock = 0;
+ }
+
+ uint32_t getNumQueued() // Returns the current entry count.
+ {
+ return mNumEntries;
+ }
+
+ bool tryLock() // Non-blocking lock attempt; returns true only if this call changed mLock from 0 to 1.
+ {
+ if (mLock) // cheap early-out before attempting the interlocked operation
+ {
+ return false;
+ }
+
+ // try to lock the FIFO
+ LONG initial = InterlockedCompareExchange(&mLock, 1, 0);
+ return (initial == 0);
+ }
+
+ void unlock() // NOTE(review): plain volatile store with no _ReadWriteBarrier(), unlike clear() - confirm this is intended
+ {
+ mLock = 0;
+ }
+
+ T* peek() // Returns a pointer to the entry at the head, or nullptr when the queue is empty; does not remove it.
+ {
+ if (mNumEntries == 0)
+ {
+ return nullptr;
+ }
+ uint32_t block = mHead >> mBlockSizeShift;
+ return &mBlocks[block][mHead & (mBlockSize-1)]; // low bits of mHead select the slot inside the block
+ }
+
+ void dequeue_noinc() // Removes the head entry by advancing mHead and decrementing the count; no empty check.
+ {
+ mHead ++;
+ mNumEntries --;
+ }
+
+ bool enqueue_try_nosync(Arena& arena, const T* entry) // Appends a byte copy of *entry; takes no lock itself; always returns true.
+ {
+ memcpy(&mCurBlock[mTail], entry, sizeof(T));
+
+ mTail ++;
+ if (mTail == mBlockSize) // current block is full: move on to the next one
+ {
+ if (++mCurBlockIdx < mBlocks.size()) // reuse a block that already exists
+ {
+ mCurBlock = mBlocks[mCurBlockIdx];
+ }
+ else // otherwise allocate a new block from the arena
+ {
+ T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+ SWR_ASSERT(newBlock);
+
+ mBlocks.push_back(newBlock);
+ mCurBlock = newBlock;
+ }
+
+ mTail = 0;
+ }
+
+ mNumEntries ++; // count is bumped after the entry data has been written
+ return true;
+ }
+
+ void destroy() // Intentionally empty in this version.
+ {
+ }
+
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
new file mode 100644
index 00000000000..83d85fc86d8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -0,0 +1,196 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_conversion.h
+*
+* @brief Helpers for loading, clamping, normalizing and storing SOA pixel data
+*
+******************************************************************************/
+#include "format_types.h"
+#include "format_traits.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads SIMD packed pixels in SOA format and converts them to
+/// SOA RGBA32_FLOAT format.
+/// @param pSrc - source data in SOA form
+/// @param dst - output data in SOA form
+template<SWR_FORMAT SrcFormat>
+INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+{
+ // fast path for float32
+ if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) // decided from component 0 only
+ {
+ auto lambda = [&](int comp)
+ {
+ simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar))); // components sit one simdscalar apart
+
+ dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+ };
+
+ UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); // invokes lambda once per component of the format
+ return;
+ }
+
+ auto lambda = [&](int comp) // general path: load, unpack, optionally convert, then advance pSrc
+ {
+ // load SIMD components
+ simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
+
+ // unpack
+ vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
+
+ // convert
+ if (FormatTraits<SrcFormat>::isNormalized(comp)) // only normalized components become floats; others keep their unpacked bits
+ {
+ vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
+ }
+
+ dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+
+ pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; // step past this component: bits-per-component * SIMD width, in bytes
+ };
+
+ UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clamps the given component based on the requirements on the
+/// Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
+{
+ if (FormatTraits<Format>::isNormalized(Component))
+ {
+ if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM) // UNORM: lower bound 0.0
+ {
+ vComp = _simd_max_ps(vComp, _simd_setzero_ps());
+ }
+
+ if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM) // SNORM: lower bound -1.0
+ {
+ vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
+ }
+ vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f)); // every normalized type: upper bound 1.0
+ }
+ else if (FormatTraits<Format>::GetBPC(Component) < 32) // integer clamping is applied to components narrower than 32 bits only
+ {
+ if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
+ {
+ int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; // largest value representable in BPC unsigned bits
+ int iMin = 0;
+ simdscalari vCompi = _simd_castps_si(vComp); // the lanes are treated as integer bits here, not floats
+ vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
+ vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
+ vComp = _simd_castsi_ps(vCompi);
+ }
+ else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
+ {
+ int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; // largest value representable in BPC signed bits
+ int iMin = -1 - iMax; // i.e. -(2^(BPC-1))
+ simdscalari vCompi = _simd_castps_si(vComp);
+ vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
+ vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
+ vComp = _simd_castsi_ps(vCompi);
+ }
+ }
+
+ return vComp; // any other type/width combination is returned unchanged
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Normalize the given component based on the requirements on the
+/// Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
+{
+ if (FormatTraits<Format>::isNormalized(Component)) // non-normalized components are returned unchanged
+ {
+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component))); // scale by the format's fromFloat factor
+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); // convert to 32-bit integers; the integer bits are carried in a float register
+ }
+ return vComp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts a simdvector of pixels in SOA RGBA32_FLOAT
+/// to DstFormat and stores it in SOA form
+/// @param src - source data in SOA form
+/// @param pDst - destination buffer receiving the data in SOA form
+template<SWR_FORMAT DstFormat>
+INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
+{
+ // fast path for float32
+ if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) // decided from component 0 only
+ {
+ for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
+ {
+ simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
+
+ // Gamma-correct
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ if (comp < 3) // Input format is always RGBA32_FLOAT.
+ {
+ vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
+ }
+ }
+
+ _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); // components are written one simdscalar apart, with no clamp/normalize/pack
+ }
+ return;
+ }
+
+ auto lambda = [&](int comp) // general path: swizzle, gamma, clamp, normalize, pack, store, then advance pDst
+ {
+ simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
+
+ // Gamma-correct
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ if (comp < 3) // Input format is always RGBA32_FLOAT.
+ {
+ vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
+ }
+ }
+
+ // clamp
+ vComp = Clamp<DstFormat>(vComp, comp);
+
+ // normalize
+ vComp = Normalize<DstFormat>(vComp, comp);
+
+ // pack
+ vComp = FormatTraits<DstFormat>::pack(comp, vComp);
+
+ // store
+ FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
+
+ pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; // step past this component: bits-per-component * SIMD width, in bytes
+ };
+
+ UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); // invokes lambda once per component of the format
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
new file mode 100644
index 00000000000..52340f4987a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
@@ -0,0 +1,3548 @@
+
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_traits.h
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+#pragma once
+
+#include "format_types.h"
+#include "utils.h"
+
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatSwizzle - Component swizzle selects
+///
+/// Holds up to four compile-time swizzle selects; selects that are not
+/// supplied default to 0.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0>
+struct FormatSwizzle
+{
+    /// Return swizzle select for component.
+    /// @param c - component index; must be in [0, 3], it is not range-checked
+    /// @return the select given as template argument number c
+    INLINE static uint32_t swizzle(uint32_t c)
+    {
+        static const uint32_t s[4] = { comp0, comp1, comp2, comp3 };
+        return s[c];
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits - primary template, used for any SWR_FORMAT that has no
+/// explicit specialization. Every constant is 0 / false, block size 1x1.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT format>
+struct FormatTraits
+    : ComponentTraits<SWR_TYPE_UNKNOWN, 0>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0>
+{
+    static const uint32_t bpp          = 0;
+    static const uint32_t numComps     = 0;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_FLOAT> - 128 bpp: four 32-bit FLOAT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_SINT> - 128 bpp: four 32-bit SINT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_UINT> - 128 bpp: four 32-bit UINT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32X32_FLOAT> - 128 bpp: three 32-bit FLOAT components plus one unused 32-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32X32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_SSCALED> - 128 bpp: four 32-bit SSCALED components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_USCALED> - 128 bpp: four 32-bit USCALED components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32_32;
+    using FormatT    = Format4<32, 32, 32, 32>;
+
+    static const uint32_t bpp          = 128;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_FLOAT> - 96 bpp: three 32-bit FLOAT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32;
+    using FormatT    = Format3<32, 32, 32>;
+
+    static const uint32_t bpp          = 96;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SINT> - 96 bpp: three 32-bit SINT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32_32;
+    using FormatT    = Format3<32, 32, 32>;
+
+    static const uint32_t bpp          = 96;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_UINT> - 96 bpp: three 32-bit UINT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32_32;
+    using FormatT    = Format3<32, 32, 32>;
+
+    static const uint32_t bpp          = 96;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SSCALED> - 96 bpp: three 32-bit SSCALED components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32;
+    using FormatT    = Format3<32, 32, 32>;
+
+    static const uint32_t bpp          = 96;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_USCALED> - 96 bpp: three 32-bit USCALED components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32_32;
+    using FormatT    = Format3<32, 32, 32>;
+
+    static const uint32_t bpp          = 96;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UNORM> - 64 bpp: four 16-bit UNORM components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SNORM> - 64 bpp: four 16-bit SNORM components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SINT> - 64 bpp: four 16-bit SINT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UINT> - 64 bpp: four 16-bit UINT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_FLOAT> - 64 bpp: four 16-bit FLOAT components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_FLOAT> - 64 bpp: two 32-bit FLOAT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SINT> - 64 bpp: two 32-bit SINT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_UINT> - 64 bpp: two 32-bit UINT components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - 64 bpp: one 32-bit FLOAT component plus one unused 32-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 3; // NOTE(review): 3 although hasAlpha is false - confirm with the generator
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<X32_TYPELESS_G8X24_UINT> - 64 bpp: one 32-bit UINT component plus one unused 32-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 3; // NOTE(review): 3 although hasAlpha is false - confirm with the generator
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32A32_FLOAT> - 64 bpp: two 32-bit FLOAT components, swizzle <0, 3>, alpha is component 1
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32A32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 1;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_UNORM> - 64 bpp: three 16-bit UNORM components plus one unused 16-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNUSED, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_FLOAT> - 64 bpp: three 16-bit FLOAT components plus one unused 16-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_UNUSED, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32X32_FLOAT> - 64 bpp: two 32-bit FLOAT components, swizzle <0, 3>, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32X32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I32X32_FLOAT> - 64 bpp: two 32-bit FLOAT components, swizzle <0, 3>, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I32X32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SSCALED> - 64 bpp: four 16-bit SSCALED components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_USCALED> - 64 bpp: four 16-bit USCALED components, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose16_16_16_16;
+    using FormatT    = Format4<16, 16, 16, 16>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SSCALED> - 64 bpp: two 32-bit SSCALED components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_USCALED> - 64 bpp: two 32-bit USCALED components, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD> - 64 bpp: one 32-bit FLOAT component plus one unused 32-bit slot, no alpha
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose32_32;
+    using FormatT    = Format2<32, 32>;
+
+    static const uint32_t bpp          = 64;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 3; // NOTE(review): 3 although hasAlpha is false - confirm with the generator
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM> - 32 bpp: four 8-bit UNORM components, swizzle <2, 1, 0, 3>, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT    = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp          = 32;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM_SRGB> - 32 bpp sRGB: four 8-bit UNORM components, swizzle <2, 1, 0, 3>, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT    = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp          = 32;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM> - 32 bpp: UNORM components of 10, 10, 10 and 2 bits, alpha is component 3
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp          = 32;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM_SRGB>
+/// Compile-time traits for R10G10B10A2_UNORM_SRGB: 32 bpp, 4 components
+/// (UNORM 10/10/10/2), swizzle <0, 1, 2, 3>, alpha at component 3, isSRGB set.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10A2_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = true;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UINT>
+/// Compile-time traits for R10G10B10A2_UINT: 32 bpp, 4 components
+/// (UINT 10/10/10/2), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10A2_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM>
+/// Compile-time traits for R8G8B8A8_UNORM: 32 bpp, 4 components
+/// (UNORM 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM_SRGB>
+/// Compile-time traits for R8G8B8A8_UNORM_SRGB: 32 bpp, 4 components
+/// (UNORM 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, isSRGB set.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = true;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SNORM>
+/// Compile-time traits for R8G8B8A8_SNORM: 32 bpp, 4 components
+/// (SNORM 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SINT>
+/// Compile-time traits for R8G8B8A8_SINT: 32 bpp, 4 components
+/// (SINT 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UINT>
+/// Compile-time traits for R8G8B8A8_UINT: 32 bpp, 4 components
+/// (UINT 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UNORM>
+/// Compile-time traits for R16G16_UNORM: 32 bpp, 2 components
+/// (UNORM 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SNORM>
+/// Compile-time traits for R16G16_SNORM: 32 bpp, 2 components
+/// (SNORM 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SINT>
+/// Compile-time traits for R16G16_SINT: 32 bpp, 2 components
+/// (SINT 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UINT>
+/// Compile-time traits for R16G16_UINT: 32 bpp, 2 components
+/// (UINT 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_FLOAT>
+/// Compile-time traits for R16G16_FLOAT: 32 bpp, 2 components
+/// (FLOAT 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM>
+/// Compile-time traits for B10G10R10A2_UNORM: 32 bpp, 4 components
+/// (UNORM 10/10/10/2), swizzle <2, 1, 0, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B10G10R10A2_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM_SRGB>
+/// Compile-time traits for B10G10R10A2_UNORM_SRGB: 32 bpp, 4 components
+/// (UNORM 10/10/10/2), swizzle <2, 1, 0, 3>, alpha at component 3, isSRGB set.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B10G10R10A2_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = true;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R11G11B10_FLOAT>
+/// Compile-time traits for R11G11B10_FLOAT: 32 bpp, 3 components
+/// (FLOAT 11/11/10), swizzle <0, 1, 2>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R11G11B10_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>
+    , FormatSwizzle<0, 1, 2>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose11_11_10;
+    using FormatT = Format3<11, 11, 10>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_SINT>
+/// Compile-time traits for R32_SINT: 32 bpp, 1 component (SINT 32),
+/// swizzle <0>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_UINT>
+/// Compile-time traits for R32_UINT: 32 bpp, 1 component (UINT 32),
+/// swizzle <0>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT>
+/// Compile-time traits for R32_FLOAT: 32 bpp, 1 component (FLOAT 32),
+/// swizzle <0>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS>
+/// Compile-time traits for R24_UNORM_X8_TYPELESS: 32 bpp, 1 component
+/// (UNORM 24), swizzle <0>, hasAlpha false, not sRGB.
+/// NOTE(review): alphaComp is 3 although hasAlpha is false; most alpha-less
+/// formats around this one use 0 - confirm this is intentional.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R24_UNORM_X8_TYPELESS>
+    : ComponentTraits<SWR_TYPE_UNORM, 24>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    // Transpose uses a 32-bit single component while FormatT declares 24 bits.
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<24>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS_LD>
+/// Compile-time traits for R24_UNORM_X8_TYPELESS_LD: 32 bpp, 1 component
+/// (UNORM 24), swizzle <0>, hasAlpha false, not sRGB.
+/// NOTE(review): alphaComp is 3 although hasAlpha is false; most alpha-less
+/// formats around this one use 0 - confirm this is intentional.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R24_UNORM_X8_TYPELESS_LD>
+    : ComponentTraits<SWR_TYPE_UNORM, 24>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    // Transpose uses a 32-bit single component while FormatT declares 24 bits.
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<24>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16A16_UNORM>
+/// Compile-time traits for L16A16_UNORM: 32 bpp, 2 components
+/// (UNORM 16/16), swizzle <0, 3>, alpha at component 1, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L16A16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 1;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I24X8_UNORM>
+/// Compile-time traits for I24X8_UNORM: 32 bpp, 2 components
+/// (UNORM 24/8), swizzle <0, 3>, hasAlpha false, not sRGB.
+/// NOTE(review): the 8-bit component is typed SWR_TYPE_UNORM here, whereas
+/// the *X8/*X2 color formats in this file use SWR_TYPE_UNUSED - confirm.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I24X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose24_8;
+    using FormatT = Format2<24, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L24X8_UNORM>
+/// Compile-time traits for L24X8_UNORM: 32 bpp, 2 components
+/// (UNORM 24/8), swizzle <0, 3>, hasAlpha false, not sRGB.
+/// NOTE(review): the 8-bit component is typed SWR_TYPE_UNORM here, whereas
+/// the *X8/*X2 color formats in this file use SWR_TYPE_UNUSED - confirm.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L24X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose24_8;
+    using FormatT = Format2<24, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I32_FLOAT>
+/// Compile-time traits for I32_FLOAT: 32 bpp, 1 component (FLOAT 32),
+/// swizzle <0>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32_FLOAT>
+/// Compile-time traits for L32_FLOAT: 32 bpp, 1 component (FLOAT 32),
+/// swizzle <0>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A32_FLOAT>
+/// Compile-time traits for A32_FLOAT: 32 bpp, 1 component (FLOAT 32),
+/// swizzle <3>, alpha at component 0, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<A32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>
+    , FormatSwizzle<3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT = Format1<32>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 1;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8X8_UNORM>
+/// Compile-time traits for B8G8R8X8_UNORM: 32 bpp, numComps 3
+/// (UNORM 8/8/8 plus an UNUSED 8-bit slot), swizzle <2, 1, 0, 3>,
+/// hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B8G8R8X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8X8_UNORM_SRGB>
+/// Compile-time traits for B8G8R8X8_UNORM_SRGB: 32 bpp, numComps 3
+/// (UNORM 8/8/8 plus an UNUSED 8-bit slot), swizzle <2, 1, 0, 3>,
+/// hasAlpha false, isSRGB set.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B8G8R8X8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = true;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8X8_UNORM>
+/// Compile-time traits for R8G8B8X8_UNORM: 32 bpp, numComps 3
+/// (UNORM 8/8/8 plus an UNUSED 8-bit slot), swizzle <0, 1, 2, 3>,
+/// hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8X8_UNORM_SRGB>
+/// Compile-time traits for R8G8B8X8_UNORM_SRGB: 32 bpp, numComps 3
+/// (UNORM 8/8/8 plus an UNUSED 8-bit slot), swizzle <0, 1, 2, 3>,
+/// hasAlpha false, isSRGB set.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8X8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = true;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R9G9B9E5_SHAREDEXP>
+/// Compile-time traits for R9G9B9E5_SHAREDEXP: 32 bpp, 4 components
+/// (UINT 9/9/9/5), swizzle <0, 1, 2, 3>, hasAlpha false, not sRGB.
+/// NOTE(review): alphaComp is 3 although hasAlpha is false; most alpha-less
+/// formats around this one use 0 - confirm this is intentional.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R9G9B9E5_SHAREDEXP>
+    : ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    using TransposeT = Transpose9_9_9_5;
+    using FormatT = Format4<9, 9, 9, 5>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10X2_UNORM>
+/// Compile-time traits for B10G10R10X2_UNORM: 32 bpp, numComps 3
+/// (UNORM 10/10/10 plus an UNUSED 2-bit slot), swizzle <2, 1, 0, 3>,
+/// hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B10G10R10X2_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNUSED, 2>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16A16_FLOAT>
+/// Compile-time traits for L16A16_FLOAT: 32 bpp, 2 components
+/// (FLOAT 16/16), swizzle <0, 3>, alpha at component 1, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L16A16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 1;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10X2_USCALED>
+/// Compile-time traits for R10G10B10X2_USCALED: 32 bpp, numComps 3
+/// (USCALED 10/10/10 plus an UNUSED 2-bit slot), swizzle <0, 1, 2, 3>,
+/// hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10X2_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_UNUSED, 2>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT = Format4<10, 10, 10, 2>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 3;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SSCALED>
+/// Compile-time traits for R8G8B8A8_SSCALED: 32 bpp, 4 components
+/// (SSCALED 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_USCALED>
+/// Compile-time traits for R8G8B8A8_USCALED: 32 bpp, 4 components
+/// (USCALED 8/8/8/8), swizzle <0, 1, 2, 3>, alpha at component 3, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8A8_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>
+    , FormatSwizzle<0, 1, 2, 3>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose8_8_8_8;
+    using FormatT = Format4<8, 8, 8, 8>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 4;
+    static const uint32_t alphaComp = 3;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = true;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SSCALED>
+/// Compile-time traits for R16G16_SSCALED: 32 bpp, 2 components
+/// (SSCALED 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_USCALED>
+/// Compile-time traits for R16G16_USCALED: 32 bpp, 2 components
+/// (USCALED 16/16), swizzle <0, 1>, hasAlpha false, not sRGB.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000> // 0x3f800000 is the bit pattern of 1.0f
+{
+    using TransposeT = Transpose16_16;
+    using FormatT = Format2<16, 16>;
+
+    static const uint32_t bpp = 32;
+    static const uint32_t numComps = 2;
+    static const uint32_t alphaComp = 0;
+    static const uint32_t bcWidth = 1;
+    static const uint32_t bcHeight = 1;
+
+    static const bool hasAlpha = false;
+    static const bool isSRGB = false;
+    static const bool isBC = false;
+    static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R32_SSCALED.
+/// Declares bpp = 32 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 32;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT    = Format1<32>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R32_USCALED.
+/// Declares bpp = 32 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 32;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<32>;
+    using FormatT    = Format1<32>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G6R5_UNORM.
+/// Declares bpp = 16 and numComps = 3; hasAlpha is false.
+/// Uses FormatSwizzle<2, 1, 0>. The last Defaults<> argument, 0x3f800000,
+/// is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G6R5_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>
+    , FormatSwizzle<2, 1, 0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_6_5;
+    using FormatT    = Format3<5, 6, 5>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G6R5_UNORM_SRGB.
+/// Declares bpp = 16 and numComps = 3; hasAlpha is false; isSRGB is true.
+/// Uses FormatSwizzle<2, 1, 0>. The last Defaults<> argument, 0x3f800000,
+/// is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G6R5_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>
+    , FormatSwizzle<2, 1, 0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_6_5;
+    using FormatT    = Format3<5, 6, 5>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G5R5A1_UNORM.
+/// Declares bpp = 16 and numComps = 4; hasAlpha is true with alphaComp = 3.
+/// Uses FormatSwizzle<2, 1, 0, 3>. The last Defaults<> argument, 0x3f800000,
+/// is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G5R5A1_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_5_5_1;
+    using FormatT    = Format4<5, 5, 5, 1>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G5R5A1_UNORM_SRGB.
+/// Declares bpp = 16 and numComps = 4; hasAlpha is true with alphaComp = 3;
+/// isSRGB is true. Uses FormatSwizzle<2, 1, 0, 3>. The last Defaults<>
+/// argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G5R5A1_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_5_5_1;
+    using FormatT    = Format4<5, 5, 5, 1>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B4G4R4A4_UNORM.
+/// Declares bpp = 16 and numComps = 4; hasAlpha is true with alphaComp = 3.
+/// Uses FormatSwizzle<2, 1, 0, 3>. The last Defaults<> argument, 0x3f800000,
+/// is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B4G4R4A4_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose4_4_4_4;
+    using FormatT    = Format4<4, 4, 4, 4>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B4G4R4A4_UNORM_SRGB.
+/// Declares bpp = 16 and numComps = 4; hasAlpha is true with alphaComp = 3;
+/// isSRGB is true. Uses FormatSwizzle<2, 1, 0, 3>. The last Defaults<>
+/// argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B4G4R4A4_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 4;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 3;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose4_4_4_4;
+    using FormatT    = Format4<4, 4, 4, 4>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_UNORM.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_SNORM.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_SINT.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_UINT.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_UNORM.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_SNORM.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_SINT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_UINT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_FLOAT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing I16_UNORM.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L16_UNORM.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing A16_UNORM.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is true with alphaComp = 0.
+/// Uses FormatSwizzle<3>. The last Defaults<> argument, 0x3f800000, is the
+/// bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<A16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>
+    , FormatSwizzle<3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L8A8_UNORM.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is true with alphaComp = 1.
+/// Uses FormatSwizzle<0, 3>. The last Defaults<> argument, 0x3f800000, is
+/// the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 1;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing I16_FLOAT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L16_FLOAT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing A16_FLOAT.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is true with alphaComp = 0.
+/// Uses FormatSwizzle<3>. The last Defaults<> argument, 0x3f800000, is the
+/// bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<A16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>
+    , FormatSwizzle<3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L8A8_UNORM_SRGB.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is true with alphaComp = 1;
+/// isSRGB is true. Uses FormatSwizzle<0, 3>. The last Defaults<> argument,
+/// 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 1;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G5R5X1_UNORM.
+/// Declares bpp = 16 and numComps = 3; hasAlpha is false. The fourth
+/// ComponentTraits entry is SWR_TYPE_UNUSED with width 1, while FormatT is
+/// still Format4<5, 5, 5, 1>. Uses FormatSwizzle<2, 1, 0, 3>. The last
+/// Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G5R5X1_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_5_5_1;
+    using FormatT    = Format4<5, 5, 5, 1>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing B5G5R5X1_UNORM_SRGB.
+/// Declares bpp = 16 and numComps = 3; hasAlpha is false; isSRGB is true.
+/// The fourth ComponentTraits entry is SWR_TYPE_UNUSED with width 1, while
+/// FormatT is still Format4<5, 5, 5, 1>. Uses FormatSwizzle<2, 1, 0, 3>.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<B5G5R5X1_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>
+    , FormatSwizzle<2, 1, 0, 3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 3;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = true;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose5_5_5_1;
+    using FormatT    = Format4<5, 5, 5, 1>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_SSCALED.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8G8_USCALED.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>
+    , FormatSwizzle<0, 1>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_SSCALED.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R16_USCALED.
+/// Declares bpp = 16 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 16>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<16>;
+    using FormatT    = Format1<16>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L8A8_UINT.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is true with alphaComp = 1.
+/// Uses FormatSwizzle<0, 3>. The last Defaults<> argument is the integer
+/// value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8A8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 1;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing L8A8_SINT.
+/// Declares bpp = 16 and numComps = 2; hasAlpha is true with alphaComp = 1.
+/// Uses FormatSwizzle<0, 3>. The last Defaults<> argument is the integer
+/// value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8A8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>
+    , FormatSwizzle<0, 3>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 16;
+    static const uint32_t numComps     = 2;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 1;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = Transpose8_8;
+    using FormatT    = Format2<8, 8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8_UNORM.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8_SNORM.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8_SINT.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing R8_UINT.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument is the integer value 0x1.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing A8_UNORM.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is true with alphaComp = 0.
+/// Uses FormatSwizzle<3>. The last Defaults<> argument, 0x3f800000, is the
+/// bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<3>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = true;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FormatTraits specialization describing I8_UNORM.
+/// Declares bpp = 8 and numComps = 1; hasAlpha is false.
+/// The last Defaults<> argument, 0x3f800000, is the bit pattern of 1.0f.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>
+    , FormatSwizzle<0>
+    , Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp          = 8;
+    static const uint32_t numComps     = 1;
+    static const bool     hasAlpha     = false;
+    static const uint32_t alphaComp    = 0;
+    static const bool     isSRGB       = false;
+    static const bool     isBC         = false;
+    static const bool     isSubsampled = false;
+    static const uint32_t bcWidth      = 1;
+    static const uint32_t bcHeight     = 1;
+
+    using TransposeT = TransposeSingleComponent<8>;
+    using FormatT    = Format1<8>;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_SSCALED>
+ : ComponentTraits<SWR_TYPE_SSCALED, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8_USCALED>
+ : ComponentTraits<SWR_TYPE_USCALED, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8_UINT>
+ : ComponentTraits<SWR_TYPE_UINT, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<L8_SINT>
+ : ComponentTraits<SWR_TYPE_SINT, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I8_UINT>
+ : ComponentTraits<SWR_TYPE_UINT, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<I8_SINT>
+ : ComponentTraits<SWR_TYPE_SINT, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<8>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 8;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<YCRCB_SWAPUVY>
+ : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>
+ , FormatSwizzle<0, 1, 2, 3>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8_8;
+ using FormatT = Format4<8, 8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 32;
+ static const uint32_t numComps = 4;
+ static const uint32_t bcWidth = 2;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = true;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC1_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<64>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 64;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC2_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC3_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC4_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<64>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 64;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC5_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC1_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<64>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 64;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC2_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC3_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<YCRCB_SWAPUV>
+ : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>
+ , FormatSwizzle<0, 1, 2, 3>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8_8;
+ using FormatT = Format4<8, 8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 32;
+ static const uint32_t numComps = 4;
+ static const uint32_t bcWidth = 2;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = true;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8;
+ using FormatT = Format3<8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 24;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8_SNORM>
+ : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8;
+ using FormatT = Format3<8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 24;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8_SSCALED>
+ : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8;
+ using FormatT = Format3<8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 24;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8_USCALED>
+ : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8;
+ using FormatT = Format3<8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 24;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC4_SNORM>
+ : ComponentTraits<SWR_TYPE_SNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<64>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 64;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC5_SNORM>
+ : ComponentTraits<SWR_TYPE_SNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_FLOAT>
+ : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_SNORM>
+ : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_SSCALED>
+ : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_USCALED>
+ : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC6H_SF16>
+ : ComponentTraits<SWR_TYPE_SNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC7_UNORM>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC7_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<BC6H_UF16>
+ : ComponentTraits<SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = TransposeSingleComponent<128>;
+ using FormatT = Format1<8>;
+
+ // Size constants.
+ static const uint32_t bpp = 128;
+ static const uint32_t numComps = 1;
+ static const uint32_t bcWidth = 4;
+ static const uint32_t bcHeight = 4;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = true;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R8G8B8_UNORM_SRGB>
+ : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose8_8_8;
+ using FormatT = Format3<8, 8, 8>;
+
+ // Size constants.
+ static const uint32_t bpp = 24;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = true;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_UINT>
+ : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R16G16B16_SINT>
+ : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>
+ , FormatSwizzle<0, 1, 2>
+ , Defaults<0, 0, 0, 0x1>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose16_16_16;
+ using FormatT = Format3<16, 16, 16>;
+
+ // Size constants.
+ static const uint32_t bpp = 48;
+ static const uint32_t numComps = 3;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = false;
+ static const uint32_t alphaComp = 0;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10A2_SNORM>
+ : ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>
+ , FormatSwizzle<0, 1, 2, 3>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose10_10_10_2;
+ using FormatT = Format4<10, 10, 10, 2>;
+
+ // Size constants.
+ static const uint32_t bpp = 32;
+ static const uint32_t numComps = 4;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10A2_USCALED>
+ : ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>
+ , FormatSwizzle<0, 1, 2, 3>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose10_10_10_2;
+ using FormatT = Format4<10, 10, 10, 2>;
+
+ // Size constants.
+ static const uint32_t bpp = 32;
+ static const uint32_t numComps = 4;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct FormatTraits<R10G10B10A2_SSCALED>
+ : ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>
+ , FormatSwizzle<0, 1, 2, 3>
+ , Defaults<0, 0, 0, 0x3f800000>
+{
+ // Helper types bound to this format.
+ using TransposeT = Transpose10_10_10_2;
+ using FormatT = Format4<10, 10, 10, 2>;
+
+ // Size constants.
+ static const uint32_t bpp = 32;
+ static const uint32_t numComps = 4;
+ static const uint32_t bcWidth = 1;
+ static const uint32_t bcHeight = 1;
+
+ // Alpha description.
+ static const bool hasAlpha = true;
+ static const uint32_t alphaComp = 3;
+
+ // Classification flags.
+ static const bool isSRGB = false;
+ static const bool isBC = false;
+ static const bool isSubsampled = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for
+/// R10G10B10A2_SINT: 32 bits per pixel, four SINT components of
+/// 10/10/10/2 bits, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for
+/// B10G10R10A2_SNORM: 32 bits per pixel, four SNORM components of
+/// 10/10/10/2 bits, swizzle <2, 1, 0, 3>, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for
+/// B10G10R10A2_USCALED: 32 bits per pixel, four USCALED components of
+/// 10/10/10/2 bits, swizzle <2, 1, 0, 3>, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for
+/// B10G10R10A2_SSCALED: 32 bits per pixel, four SSCALED components of
+/// 10/10/10/2 bits, swizzle <2, 1, 0, 3>, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for
+/// B10G10R10A2_UINT: 32 bits per pixel, four UINT components of
+/// 10/10/10/2 bits, swizzle <2, 1, 0, 3>, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for
+/// B10G10R10A2_SINT: 32 bits per pixel, four SINT components of
+/// 10/10/10/2 bits, swizzle <2, 1, 0, 3>, alpha in component 3.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    // Helper types selected for the 10/10/10/2 bit layout.
+    using TransposeT = Transpose10_10_10_2;
+    using FormatT    = Format4<10, 10, 10, 2>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+
+    // Alpha is present and is component 3.
+    static const bool     hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UINT> - Format traits specialization for
+/// R8G8B8_UINT: 24 bits per pixel, three 8-bit UINT components, no alpha.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    // Helper types selected for the 8/8/8 bit layout.
+    using TransposeT = Transpose8_8_8;
+    using FormatT    = Format3<8, 8, 8>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+
+    // No alpha channel; alphaComp is left at 0.
+    static const bool     hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SINT> - Format traits specialization for
+/// R8G8B8_SINT: 24 bits per pixel, three 8-bit SINT components, no alpha.
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    // Helper types selected for the 8/8/8 bit layout.
+    using TransposeT = Transpose8_8_8;
+    using FormatT    = Format3<8, 8, 8>;
+
+    // Pixel size in bits and number of components.
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+
+    // No alpha channel; alphaComp is left at 0.
+    static const bool     hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+
+    // Format class flags: none apply to this format.
+    static const bool     isSRGB{ false };
+    static const bool     isBC{ false };
+    static const bool     isSubsampled{ false };
+
+    // Block dimensions.
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+};
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
new file mode 100644
index 00000000000..aa350259a15
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -0,0 +1,1075 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_types.h
+*
+* @brief Definitions for SWR_FORMAT functions.
+*
+******************************************************************************/
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking same pixel sizes.
+/// The primary template only records the bit count; every helper is
+/// deleted, so calling one for a (NumBits, Signed) pair that has no
+/// explicit specialization is a compile error.
+//////////////////////////////////////////////////////////////////////////
+template <uint32_t NumBits, bool Signed = false>
+struct PackTraits
+{
+    static const uint32_t MyNumBits = NumBits;
+
+    // Channel conversion helpers (deleted in the generic case).
+    static simdscalar pack(simdscalar &in) = delete;
+    static simdscalar unpack(simdscalar &in) = delete;
+
+    // SOA memory access helpers (deleted in the generic case).
+    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+    static simdscalar loadSOA(const BYTE *pSrc) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking unused channels.
+/// A zero-bit channel stores nothing: loads and conversions return an
+/// all-zero vector and stores do nothing.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<0, false>
+{
+    static const uint32_t MyNumBits = 0;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+        return _simd_setzero_ps();
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // Nothing to write for a zero-width channel.
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+        return _simd_setzero_ps();
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+        return _simd_setzero_ps();
+    }
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels.
+/// Only a SIMD width of 8 is implemented; other widths fail to compile.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, false>
+{
+    static const uint32_t MyNumBits = 8;
+
+    /// Reads 8 bytes from pSrc into the low 64 bits of the result; all
+    /// remaining bits are zero.
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // A 64-bit scalar load fetches the 8 bytes and zeroes bits 64..127.
+        const __m128 vBytes = _mm_castpd_ps(_mm_load_sd((const double*)pSrc));
+        return _mm256_insertf128_ps(_mm256_setzero_ps(), vBytes, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Writes the low 8 bytes of src to pDst.
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const __m128d vLow = _mm_castps_pd(_mm256_castps256_ps128(src));
+        _mm_storel_pd((double*)pDst, vLow);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Zero-extends the 8 low bytes of `in` to eight 32-bit lanes.
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        const __m128i vPacked = _mm_castps_si128(_mm256_castps256_ps128(in));
+
+        // Bytes 0..3 widen directly; bytes 4..7 are moved into place by a byte
+        // shuffle whose 0x80 selectors zero the upper three bytes of each lane.
+        const __m128i vHiSelect = _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004);
+        const __m128i vLoLanes = _mm_cvtepu8_epi32(vPacked);
+        const __m128i vHiLanes = _mm_shuffle_epi8(vPacked, vHiSelect);
+
+        const __m256i vAll = _mm256_insertf128_si256(_mm256_castsi128_si256(vLoLanes), vHiLanes, 1);
+        return _mm256_castsi256_ps(vAll);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        const __m128i vPacked = _mm_castps_si128(_mm256_castps256_ps128(in));
+        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(vPacked));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Narrows eight 32-bit lanes to 8 bytes in the low 64 bits using
+    /// unsigned-saturating packs. Bits above the low 64 are unspecified.
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const simdscalari vWide = _simd_castps_si(in);
+        const __m128i vWords = _mm_packus_epi32(_mm256_castsi256_si128(vWide), _mm256_extractf128_si256(vWide, 1));
+        const __m128i vBytes = _mm_packus_epi16(vWords, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(vBytes));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit signed channels.
+/// Only a SIMD width of 8 is implemented; other widths fail to compile.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, true>
+{
+    static const uint32_t MyNumBits = 8;
+
+    /// Reads 8 bytes from pSrc into the low 64 bits of the result; all
+    /// remaining bits are zero.
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Writes the low 8 bytes of src to pDst.
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // store simd bytes
+#if KNOB_SIMD_WIDTH == 8
+        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Sign-extends the 8 low bytes of `in` to eight 32-bit lanes.
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi8_epi32(src);
+        // The upper four bytes must be sign-extended as well. The previous
+        // byte shuffle filled the high bytes of each lane with zero, turning
+        // negative values into positive ones. Shift bytes 4..7 down to 0..3
+        // and reuse the sign-extending conversion instead.
+        __m128i resHi = _mm_cvtepi8_epi32(_mm_srli_si128(src, 4));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Narrows eight 32-bit lanes to 8 bytes in the low 64 bits using
+    /// signed-saturating packs. Bits above the low 64 are unspecified.
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels.
+/// Only a SIMD width of 8 is implemented; other widths fail to compile.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, false>
+{
+    static const uint32_t MyNumBits = 16;
+
+    /// Reads 16 bytes from pSrc with an aligned 128-bit load into the low
+    /// half of the result; the upper half is zero.
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const __m128 vWords = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(_mm256_setzero_ps(), vWords, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Writes the low 16 bytes of src (2 bytes for each of 8 lanes) to pDst
+    /// with an aligned 128-bit store.
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const __m128 vLow = _mm256_castps256_ps128(src);
+        _mm_store_ps((float*)pDst, vLow);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Zero-extends the 8 low 16-bit words of `in` to eight 32-bit lanes.
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        const __m128i vPacked = _mm_castps_si128(_mm256_castps256_ps128(in));
+
+        // Words 0..3 widen directly; words 4..7 are moved into place by a byte
+        // shuffle whose 0x80 selectors zero the upper two bytes of each lane.
+        const __m128i vHiSelect = _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908);
+        const __m128i vLoLanes = _mm_cvtepu16_epi32(vPacked);
+        const __m128i vHiLanes = _mm_shuffle_epi8(vPacked, vHiSelect);
+
+        const __m256i vAll = _mm256_insertf128_si256(_mm256_castsi128_si256(vLoLanes), vHiLanes, 1);
+        return _mm256_castsi256_ps(vAll);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        const __m128i vPacked = _mm_castps_si128(_mm256_castps256_ps128(in));
+        return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(vPacked));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Narrows eight 32-bit lanes to eight 16-bit words in the low 128 bits
+    /// using an unsigned-saturating pack.
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const simdscalari vWide = _simd_castps_si(in);
+        const __m128i vWords = _mm_packus_epi32(_mm256_castsi256_si128(vWide), _mm256_extractf128_si256(vWide, 1));
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(vWords));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit signed channels.
+/// Only a SIMD width of 8 is implemented; other widths fail to compile.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, true>
+{
+    static const uint32_t MyNumBits = 16;
+
+    /// Reads 16 bytes from pSrc with an aligned 128-bit load into the low
+    /// half of the result; the upper half is zero.
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Writes the low 16 bytes of src to pDst with an aligned 128-bit store.
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // store 16B (2B * 8)
+        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Sign-extends the 8 low 16-bit words of `in` to eight 32-bit lanes.
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi16_epi32(src);
+        // The upper four words must be sign-extended as well. The previous
+        // byte shuffle filled the high half of each lane with zero, turning
+        // negative values into positive ones. Shift words 4..7 down to 0..3
+        // and reuse the sign-extending conversion instead.
+        __m128i resHi = _mm_cvtepi16_epi32(_mm_srli_si128(src, 8));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    /// Narrows eight 32-bit lanes to eight 16-bit words in the low 128 bits
+    /// using a signed-saturating pack.
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        return _mm256_castsi256_ps(res);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 32 bit channels.
+/// A 32-bit channel already fills a SIMD lane, so pack / unpack hand the
+/// input back unchanged and load / store move a whole simdscalar.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<32, false>
+{
+    static const uint32_t MyNumBits = 32;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+        const float *pLanes = (const float*)pSrc;
+        return _simd_load_ps(pLanes);
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        float *pLanes = (float*)pDst;
+        _simd_store_ps(pLanes, src);
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+        return in;
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+        return in;
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits.
+/// Generic fallback for (type, NumBits) pairs without a specialization:
+/// inherits PackTraits<NumBits>, toFloat() returns 0, and fromFloat() /
+/// convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE type, uint32_t NumBits>
+struct TypeTraits : PackTraits<NumBits>
+{
+    static const SWR_TYPE MyType = type;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT8.
+/// Uses the unsigned 8-bit PackTraits. toFloat() returns 0; fromFloat()
+/// and convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT8.
+/// Uses the signed 8-bit PackTraits. toFloat() returns 0; fromFloat()
+/// and convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT16.
+/// Uses the unsigned 16-bit PackTraits. toFloat() returns 0; fromFloat()
+/// and convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT16.
+/// Uses the signed 16-bit PackTraits. toFloat() returns 0; fromFloat()
+/// and convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT32.
+/// Uses the 32-bit PackTraits. toFloat() returns 0; fromFloat() and
+/// convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT32.
+/// Uses the 32-bit PackTraits. toFloat() returns 0; fromFloat() and
+/// convertSrgb() trip SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+
+    static float toFloat()
+    {
+        return 0.0f;
+    }
+
+    static float fromFloat()
+    {
+        SWR_ASSERT(0);
+        return 0.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM5.
+/// Scale factors are based on 31, the largest 5-bit value. Inherits the
+/// generic PackTraits<5>, whose pack / load helpers are deleted.
+/// convertSrgb() trips SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 31.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 31.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM6.
+/// Scale factors are based on 63, the largest 6-bit value. Inherits the
+/// generic PackTraits<6>, whose pack / load helpers are deleted.
+/// convertSrgb() trips SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 63.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 63.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM8.
+/// Scale factors are based on 255, the largest unsigned 8-bit value.
+/// Uses the unsigned 8-bit PackTraits. convertSrgb() trips SWR_ASSERT(0)
+/// before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 255.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 255.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM8.
+/// Scale factors are based on 127, the largest signed 8-bit value.
+/// Uses the signed 8-bit PackTraits. convertSrgb() trips SWR_ASSERT(0)
+/// before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 127.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 127.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM16.
+/// Scale factors are based on 65535, the largest unsigned 16-bit value.
+/// Uses the unsigned 16-bit PackTraits. convertSrgb() trips
+/// SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 65535.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 65535.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM16.
+/// Scale factors are based on 32767, the largest signed 16-bit value.
+/// Uses the signed 16-bit PackTraits. convertSrgb() trips SWR_ASSERT(0)
+/// before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
+{
+    // Report SNORM, matching the template argument and the SNORM8
+    // specialization (this previously said SWR_TYPE_UNORM).
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+    static float toFloat() { return 1.0f / 32767.0f; }
+    static float fromFloat() { return 32767.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM24.
+/// Scale factors are based on 16777215, the largest 24-bit value. The
+/// channel uses the 32-bit PackTraits helpers. convertSrgb() trips
+/// SWR_ASSERT(0) before returning zero.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+
+    static float toFloat()
+    {
+        return 1.0f / 16777215.0f;
+    }
+
+    static float fromFloat()
+    {
+        return 16777215.0f;
+    }
+
+    static simdscalar convertSrgb(simdscalar &in)
+    {
+        SWR_ASSERT(0);
+        return _simd_setzero_ps();
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// FLOAT Specializations from here on...
+//////////////////////////////////////////////////////////////////////////
+// Reinterpret the bits of a __m128 float vector as __m128i (no value conversion).
+#define TO_M128i(a) _mm_castps_si128(a)
+// Reinterpret the bits of a __m128i integer vector as __m128 (no value conversion).
+#define TO_M128(a) _mm_castsi128_ps(a)
+
+#include "math.h"
+
+/// Approximates arg^(expnum/expden) per lane by treating the IEEE-754 bit
+/// pattern of a float as a scaled logarithm: the bits are converted to a
+/// float, multiplied by the exponent, and converted back into bits.
+/// coeffnum/coeffden feeds the constant pre-scale applied to the input.
+template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
+inline static __m128 fastpow(__m128 arg) {
+    // Constant pre-correction factor, computed once per instantiation.
+    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
+        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
+    const __m128 vPower = _mm_set1_ps(1.0f * expnum / expden);
+
+    const __m128 vScaled = _mm_mul_ps(arg, factor);
+
+    // Integer view of the float bits, converted to float: the "logarithm".
+    const __m128 vLog = _mm_cvtepi32_ps(_mm_castps_si128(vScaled));
+
+    // Scale the logarithm by the power, then turn the rounded integer back
+    // into float bits to exponentiate.
+    const __m128i vResultBits = _mm_cvtps_epi32(_mm_mul_ps(vLog, vPower));
+    return _mm_castsi128_ps(vResultBits);
+}
+
+/// Approximates arg^(5/12) per lane (the sRGB 1/2.4 exponent) as the fourth
+/// root of an approximation of arg^(5/3).
+inline static __m128 pow512_4(__m128 arg) {
+    // 5/12 is too small, so compute the 4th root of 20/12 instead.
+    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
+    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
+    const __m128 vPow23 = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
+
+    // First estimate: arg * arg^(2/3).
+    const __m128 vOver = _mm_mul_ps(arg, vPow23);
+
+    // Second estimate: arg^2 * rsqrt(arg^(2/3)).
+    const __m128 vInvSqrtPow23 = _mm_rsqrt_ps(vPow23);
+    const __m128 vSquared = _mm_mul_ps(arg, arg);
+    const __m128 vUnder = _mm_mul_ps(vSquared, vInvSqrtPow23);
+
+    // Blend the two estimates with a constant weight.
+    const __m128 vWeight = _mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f);
+    __m128 vRoot = _mm_mul_ps(vWeight, _mm_add_ps(vOver, vUnder));
+
+    // x * rsqrt(x) approximates sqrt(x); applying it twice gives the 4th root.
+    vRoot = _mm_mul_ps(vRoot, _mm_rsqrt_ps(vRoot));
+    vRoot = _mm_mul_ps(vRoot, _mm_rsqrt_ps(vRoot));
+    return vRoot;
+}
+
+/// Raises each of the four lanes of Base to the power Exp using scalar powf.
+/// @param Base  four packed floats
+/// @param Exp   exponent applied to every lane
+/// @return lane i of the result is powf(lane i of Base, Exp)
+inline static __m128 powf_wrapper(__m128 Base, float Exp)
+{
+    // Spill the lanes to memory instead of aliasing the vector through a
+    // float pointer.
+    float lanes[4];
+    _mm_storeu_ps(lanes, Base);
+
+    // _mm_setr_ps takes its arguments in lane order (lane 0 first). The
+    // previous _mm_set_ps call takes them highest lane first, so passing
+    // f[0]..f[3] returned the results with the lanes reversed.
+    return _mm_setr_ps(powf(lanes[0], Exp),
+                       powf(lanes[1], Exp),
+                       powf(lanes[2], Exp),
+                       powf(lanes[3], Exp));
+}
+
+/// Applies the linear-to-sRGB transfer function to four packed floats.
+/// Lanes not greater than 0.0031308 use the linear segment (12.92 * c); the
+/// remaining lanes use 1.055 * c^(1/2.4) - 0.055. The power is approximated
+/// by pow512_4 when KNOB_USE_FAST_SRGB is TRUE and computed with powf
+/// otherwise.
+static inline __m128 ConvertFloatToSRGB2(__m128& Src)
+{
+    // All-ones in every lane where the cutoff is not less than the source,
+    // i.e. the source is <= the cutoff (or the comparison is unordered).
+    const __m128i vLinearLanes = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
+
+    // Squeeze the mask down to 16 bits (4 bits per lane).
+    const int laneBits = _mm_movemask_epi8(vLinearLanes);
+
+    // Fast path: every lane is on the linear segment, so skip the power.
+    if (laneBits == 0xFFFF)
+    {
+        return _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+    }
+
+    // --> 1.055f * c^(1.0f/2.4f) - 0.055f
+#if KNOB_USE_FAST_SRGB == TRUE
+    // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
+    __m128 vCurve = pow512_4(Src);
+#else
+    __m128 vCurve = powf_wrapper(Src, 1.0f / 2.4f);
+#endif
+    vCurve = _mm_mul_ps(vCurve, _mm_set1_ps(1.055f));
+    vCurve = _mm_sub_ps(vCurve, _mm_set1_ps(0.055f));
+
+    // Fast path: every lane is on the curved segment.
+    if (laneBits == 0x0)
+    {
+        return vCurve;
+    }
+
+    // Mixed case: pick the linear or curved value lane by lane with the mask.
+    const __m128 vLinear = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+    const __m128i vLinearPart = _mm_and_si128(vLinearLanes, TO_M128i(vLinear));
+    const __m128i vCurvePart = _mm_andnot_si128(vLinearLanes, TO_M128i(vCurve));
+    return TO_M128(_mm_or_si128(vLinearPart, vCurvePart));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for FLOAT16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
+{
+ static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
+ static float toFloat() { return 1.0f; }
+ static float fromFloat() { return 1.0f; }
+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+
+ static simdscalar pack(const simdscalar &in) // Converts 8 float32 lanes to float16 bit patterns; result is returned reinterpreted as simdscalar.
+ {
+#if KNOB_SIMD_WIDTH == 8
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // input is 8 packed float32, output is 8 packed float16 (manual conversion path used for the AVX arch knob)
+ simdscalari src = _simd_castps_si(in);
+
+ static const uint32_t FLOAT_EXP_BITS = 8;
+ static const uint32_t FLOAT_MANTISSA_BITS = 23;
+ static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
+ static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
+
+ static const uint32_t HALF_EXP_BITS = 5;
+ static const uint32_t HALF_MANTISSA_BITS = 10;
+ static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1;
+ static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
+
+ // minimum exponent required, exponents below this are flushed to 0 (or become half denormals, see FLOAT_EXP_MIN_FTZ).
+ static const int32_t HALF_EXP_MIN = -14;
+ static const int32_t FLOAT_EXP_BIAS = 127;
+ static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
+ static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
+
+ // maximum exponent required, exponents above this (other than Inf/NaN) are clamped to 0x7BFF below
+ static const int32_t HALF_EXP_MAX = 15;
+ static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
+
+ const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
+ const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
+ const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
+ const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
+ const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
+ const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
+
+ simdscalari vSign = _simd_and_si(src, vSignMask); // split each lane into its sign, exponent and mantissa fields
+ simdscalari vExp = _simd_and_si(src, vExpMask);
+ simdscalari vMan = _simd_and_si(src, vManMask);
+
+ simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); // lanes whose exponent is below the flush-to-zero threshold
+ simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); // lanes below the minimum exponent that are not flushed
+ simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); // lanes with an all-ones float exponent (Inf/NaN)
+ simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); // lanes above the maximum exponent that are not Inf/NaN
+
+ simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); // rebias: FLOAT_EXP_MIN maps to exponent field value 1
+
+ // pack output 16-bits into the lower 16-bits of each 32-bit channel
+ simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); // mantissa is truncated by the shift, not rounded
+
+ // Flush To Zero
+ vDst = _simd_andnot_si(vFTZMask, vDst);
+ // Apply Infinites / NaN
+ vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
+
+ // Apply clamps: clamped lanes are replaced with 0x7BFF
+ vDst = _simd_andnot_si(vClampMask, vDst);
+ vDst = _simd_or_si(vDst,
+ _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
+
+ // Compute Denormals (subnormals); scalar per-lane loop, entered only when at least one lane is flagged
+ if (!_mm256_testz_si256(vDenormMask, vDenormMask))
+ {
+ uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
+ uint32_t *pExp = (uint32_t*)&vExp;
+ uint32_t *pMan = (uint32_t*)&vMan;
+ uint32_t *pDst = (uint32_t*)&vDst;
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ if (pDenormMask[i])
+ {
+ // Need to compute subnormal value
+ uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
+ uint32_t mantissa = pMan[i] |
+ (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
+
+ pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); // overwrites the lane computed above
+ }
+ }
+ }
+
+ // Add in sign bits (float sign bit 31 moves to half sign bit 15)
+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
+
+ // Pack to lower 128-bits
+ vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
+
+#if 0
+#if !defined(NDEBUG)
+ simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); // disabled cross-check against the hardware conversion
+
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
+ }
+#endif
+#endif
+
+ return _simd_castsi_ps(vDst);
+
+#else // KNOB_ARCH != KNOB_ARCH_AVX: use the _mm256_cvtps_ph conversion intrinsic with truncation
+ return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
+#endif
+#else
+#error Unsupported vector width
+#endif
+ }
+
+ static simdscalar unpack(const simdscalar &in) // Not implemented: asserts and returns an all-zero vector; 'in' is not read.
+ {
+ // intended contract: input is 8 packed float16, output is 8 packed float32
+ SWR_ASSERT(0); // @todo implement the float16 -> float32 conversion
+ return _simd_setzero_ps();
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for FLOAT32.
+/// Inherits its pack/unpack/load/store behavior from PackTraits<32>.
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
+
+    /// Always returns 1.0f for this type.
+    static float toFloat()
+    {
+        return 1.0f;
+    }
+
+    /// Always returns 1.0f for this type.
+    static float fromFloat()
+    {
+        return 1.0f;
+    }
+
+    /// Applies ConvertFloatToSRGB2 to both 128-bit halves of 'in' when built
+    /// for the AVX or AVX2 arch knob; otherwise 'in' is returned unchanged.
+    /// @param in - vector to convert; it is modified in place.
+    /// @return copy of the (possibly converted) input vector.
+    static inline simdscalar convertSrgb(simdscalar &in)
+    {
+#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
+        // Split the 256-bit vector into its two 128-bit lanes.
+        __m128 lowerHalf = _mm256_extractf128_ps(in, 0);
+        __m128 upperHalf = _mm256_extractf128_ps(in, 1);
+
+        lowerHalf = ConvertFloatToSRGB2(lowerHalf);
+        upperHalf = ConvertFloatToSRGB2(upperHalf);
+
+        // Reassemble the converted lanes back into the caller's vector.
+        in = _mm256_insertf128_ps(_mm256_insertf128_ps(in, lowerHalf, 0), upperHalf, 1);
+#endif
+        return in;
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format1 - Bitfield for single component formats; x is the component width in bits.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x>
+struct Format1
+{
+ union
+ {
+ uint32_t r : x; // the single real component
+
+ /// @note g, b and a share storage with r (same union); they are here to provide the full r/g/b/a member set needed in Formats.
+ uint32_t g : x;
+ uint32_t b : x;
+ uint32_t a : x;
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format1 - Bitfield for single component formats - 8 bit specialization (plain uint8_t members instead of bitfields)
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format1<8>
+{
+ union
+ {
+ uint8_t r; // the single real component
+
+ /// @note g, b and a share storage with r (same union); they are here to provide the full r/g/b/a member set needed in Formats.
+ uint8_t g;
+ uint8_t b;
+ uint8_t a;
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format1 - Bitfield for single component formats - 16 bit specialization (plain uint16_t members instead of bitfields)
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format1<16>
+{
+ union
+ {
+ uint16_t r; // the single real component
+
+ /// @note g, b and a share storage with r (same union); they are here to provide the full r/g/b/a member set needed in Formats.
+ uint16_t g;
+ uint16_t b;
+ uint16_t a;
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format2 - Bitfield for 2 component formats; x and y are the bit widths of r and g.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y>
+union Format2
+{
+ struct
+ {
+ uint32_t r : x; // real components
+ uint32_t g : y;
+ };
+ struct
+ {
+ /// @note b and a overlay r and g (second view of the same union); they are here to provide the full r/g/b/a member set needed in Formats.
+ uint32_t b : x;
+ uint32_t a : y;
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format2 - Bitfield for 2 component formats - 16 bit specialization (two 8-bit fields declared on uint16_t)
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format2<8,8>
+{
+ struct
+ {
+ uint16_t r : 8; // real components
+ uint16_t g : 8;
+ };
+ struct
+ {
+ /// @note b and a overlay r and g (second view of the same union); they are here to provide the full r/g/b/a member set needed in Formats.
+ uint16_t b : 8;
+ uint16_t a : 8;
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats; x, y and z are the bit widths of r, g and b.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z>
+union Format3
+{
+ struct
+ {
+ uint32_t r : x; // real components
+ uint32_t g : y;
+ uint32_t b : z;
+ };
+ uint32_t a; ///@note This is here to provide full template needed in Formats; it overlays the r/g/b bitfields (same union).
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats - 16 bit specialization (5-6-5 fields declared on uint16_t)
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format3<5,6,5>
+{
+ struct
+ {
+ uint16_t r : 5; // real components
+ uint16_t g : 6;
+ uint16_t b : 5;
+ };
+ uint16_t a; ///@note This is here to provide full template needed in Formats; it overlays the r/g/b bitfields (same union).
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats; x, y, z and w are the bit widths of r, g, b and a.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Format4
+{
+ uint32_t r : x; // all four components are distinct fields (no aliasing)
+ uint32_t g : y;
+ uint32_t b : z;
+ uint32_t a : w;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization for 5-5-5-1 (fields declared on uint16_t)
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<5,5,5,1>
+{
+ uint16_t r : 5;
+ uint16_t g : 5;
+ uint16_t b : 5;
+ uint16_t a : 1; // single-bit alpha
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization for 4-4-4-4 (fields declared on uint16_t)
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<4,4,4,4>
+{
+ uint16_t r : 4; // four equal 4-bit fields
+ uint16_t g : 4;
+ uint16_t b : 4;
+ uint16_t a : 4;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Defaults - Compile-time table of the four per-component default values.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Defaults
+{
+    /// @brief Returns the default value for a component.
+    /// @param comp - component index; must be in [0, 3] (not range checked).
+    INLINE static uint32_t GetDefault(uint32_t comp)
+    {
+        static const uint32_t perComponent[4] = { x, y, z, w };
+        return perComponent[comp];
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ComponentTraits - Component type traits.
+/// Describes up to four components (X, Y, Z, W) by type and bit width.
+/// Every accessor takes a component index: 0..3 select X..W. The table
+/// lookups (GetType, GetBPC) are not range checked; the dispatching
+/// accessors assert on any other index and fall back to component X
+/// (isNormalized falls back to false).
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
+struct ComponentTraits
+{
+    /// Returns the SWR_TYPE of the given component.
+    INLINE static SWR_TYPE GetType(uint32_t comp)
+    {
+        static const SWR_TYPE kTypes[4] = { X, Y, Z, W };
+        return kTypes[comp];
+    }
+
+    /// Returns the number of bits of the given component.
+    INLINE static uint32_t GetBPC(uint32_t comp)
+    {
+        static const uint32_t kBits[4] = { NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
+        return kBits[comp];
+    }
+
+    /// Returns true when the given component is UNORM or SNORM.
+    INLINE static bool isNormalized(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0: return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM);
+        case 1: return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM);
+        case 2: return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM);
+        case 3: return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM);
+        default:
+            SWR_ASSERT(0);
+            return false;
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::toFloat() of the given component.
+    INLINE static float toFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::toFloat();
+        case 1: return TypeTraits<Y, NumBitsY>::toFloat();
+        case 2: return TypeTraits<Z, NumBitsZ>::toFloat();
+        case 3: return TypeTraits<W, NumBitsW>::toFloat();
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::toFloat();
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::fromFloat() of the given component.
+    INLINE static float fromFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::fromFloat();
+        case 1: return TypeTraits<Y, NumBitsY>::fromFloat();
+        case 2: return TypeTraits<Z, NumBitsZ>::fromFloat();
+        case 3: return TypeTraits<W, NumBitsW>::fromFloat();
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::fromFloat();
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::loadSOA(pSrc) of the given component.
+    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+        case 1: return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
+        case 2: return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
+        case 3: return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::storeSOA(pDst, src) of the given component.
+    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+    {
+        switch (comp)
+        {
+        case 0:
+            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+            break;
+        case 1:
+            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
+            break;
+        case 2:
+            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
+            break;
+        case 3:
+            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
+            break;
+        default:
+            SWR_ASSERT(0);
+            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+            break;
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::unpack(in) of the given component.
+    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::unpack(in);
+        case 1: return TypeTraits<Y, NumBitsY>::unpack(in);
+        case 2: return TypeTraits<Z, NumBitsZ>::unpack(in);
+        case 3: return TypeTraits<W, NumBitsW>::unpack(in);
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::unpack(in);
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::pack(in) of the given component.
+    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::pack(in);
+        case 1: return TypeTraits<Y, NumBitsY>::pack(in);
+        case 2: return TypeTraits<Z, NumBitsZ>::pack(in);
+        case 3: return TypeTraits<W, NumBitsW>::pack(in);
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::pack(in);
+        }
+    }
+
+    /// Forwards to TypeTraits<...>::convertSrgb(in) of the given component.
+    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0: return TypeTraits<X, NumBitsX>::convertSrgb(in);
+        case 1: return TypeTraits<Y, NumBitsY>::convertSrgb(in);
+        case 2: return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
+        case 3: return TypeTraits<W, NumBitsW>::convertSrgb(in);
+        default:
+            SWR_ASSERT(0);
+            return TypeTraits<X, NumBitsX>::convertSrgb(in);
+        }
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
new file mode 100644
index 00000000000..f43a672bd82
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -0,0 +1,2345 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file frontend.cpp
+*
+* @brief Implementation for Frontend which handles vertex processing,
+* primitive assembly, clipping, binning, etc.
+*
+******************************************************************************/
+
+#include "api.h"
+#include "frontend.h"
+#include "backend.h"
+#include "context.h"
+#include "rdtsc_core.h"
+#include "rasterizer.h"
+#include "utils.h"
+#include "threads.h"
+#include "pa.h"
+#include "clip.h"
+#include "tilemgr.h"
+#include "tessellator.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper function to generate a bitmask with the low bits set.
+/// @param numBits - number of low-order bits to set; asserted to be at
+///                  most 32.
+/// @return (2^numBits - 1), i.e. 0 for numBits == 0 and 0xFFFFFFFF for
+///         numBits == 32 (or larger, when asserts are compiled out).
+static INLINE uint32_t GenMask(uint32_t numBits)
+{
+    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
+
+    // Shifting a 32-bit value by its full width (or more) is undefined
+    // behavior, so the all-ones mask that the assert explicitly permits
+    // (numBits == 32) cannot be computed as (1U << 32) - 1.
+    if (numBits >= (sizeof(uint32_t) * 8))
+    {
+        return 0xFFFFFFFFU;
+    }
+    return ((1U << numBits) - 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Offsets added to post-viewport vertex positions based on
+/// raster state. One broadcast vector per pixel location; sized to cover values up to SWR_PIXEL_LOCATION_UL.
+static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+{
+ _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
+ _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrSync. Wraps the sync descriptor in a backend
+///        work item and enqueues it on macrotile (0, 0).
+/// @param pContext - pointer to SWR context (unused here).
+/// @param pDC - pointer to draw context; supplies the tile manager.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pUserData - Pointer to the SYNC_DESC that is copied into the
+///                    work item.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessSync(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    SYNC_DESC *pSyncDesc = static_cast<SYNC_DESC*>(pUserData);
+
+    BE_WORK syncWork;
+    syncWork.type = SYNC;
+    syncWork.pfnWork = ProcessSyncBE;
+    syncWork.desc.sync = *pSyncDesc;
+
+    pDC->pTileMgr->enqueue(0, 0, &syncWork);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrGetStats. Wraps the query descriptor in a
+///        backend work item and enqueues it on macrotile (0, 0).
+/// @param pContext - pointer to SWR context (unused here).
+/// @param pDC - pointer to draw context; supplies the tile manager.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pUserData - Pointer to the QUERY_DESC that is copied into the
+///                    work item.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessQueryStats(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    QUERY_DESC *pQueryDesc = static_cast<QUERY_DESC*>(pUserData);
+
+    BE_WORK queryWork;
+    queryWork.type = QUERYSTATS;
+    queryWork.pfnWork = ProcessQueryStatsBE;
+    queryWork.desc.queryStats = *pQueryDesc;
+
+    pDC->pTileMgr->enqueue(0, 0, &queryWork);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrClearRenderTarget. Enqueues one clear work
+///        item on every macrotile touched by the current fixed-point
+///        scissor rectangle (both bounds inclusive).
+/// @param pContext - pointer to SWR context (unused here).
+/// @param pDC - pointer to draw context; supplies API state and tile manager.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pUserData - Pointer to the CLEAR_DESC that is copied into the
+///                    work item.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessClear(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    CLEAR_DESC *pClearDesc = static_cast<CLEAR_DESC*>(pUserData);
+    MacroTileMgr *pMacroTiles = pDC->pTileMgr;
+
+    const API_STATE& apiState = GetApiState(pDC);
+
+    // Macrotile range covered by the scissor, in macrotile units.
+    uint32_t firstTileX = apiState.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t lastTileX = apiState.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t firstTileY = apiState.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
+    uint32_t lastTileY = apiState.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+
+    // The same work item is enqueued for every covered macrotile.
+    BE_WORK clearWork;
+    clearWork.type = CLEAR;
+    clearWork.pfnWork = ProcessClearBE;
+    clearWork.desc.clear = *pClearDesc;
+
+    for (uint32_t tileY = firstTileY; tileY <= lastTileY; ++tileY)
+    {
+        for (uint32_t tileX = firstTileX; tileX <= lastTileX; ++tileX)
+        {
+            pMacroTiles->enqueue(tileX, tileY, &clearWork);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrStoreTiles. Enqueues one store work item on
+///        every macrotile from (0, 0) up to the extent of viewport 0
+///        (x + width, y + height, rounded up to whole macrotiles).
+/// @param pContext - pointer to SWR context (unused here).
+/// @param pDC - pointer to draw context; supplies API state and tile manager.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pUserData - Pointer to the STORE_TILES_DESC that is copied into
+///                    the work item.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessStoreTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessStoreTiles);
+    STORE_TILES_DESC *pStoreDesc = static_cast<STORE_TILES_DESC*>(pUserData);
+    MacroTileMgr *pMacroTiles = pDC->pTileMgr;
+
+    const API_STATE& apiState = GetApiState(pDC);
+
+    // Number of macrotiles needed to cover viewport 0 in each direction.
+    const uint32_t tileWidth = KNOB_MACROTILE_X_DIM;
+    const uint32_t tileHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t tileCountX = (static_cast<uint32_t>(apiState.vp[0].width) + static_cast<uint32_t>(apiState.vp[0].x) + (tileWidth - 1)) / tileWidth;
+    uint32_t tileCountY = (static_cast<uint32_t>(apiState.vp[0].height) + static_cast<uint32_t>(apiState.vp[0].y) + (tileHeight - 1)) / tileHeight;
+
+    // The same store work item is enqueued for every covered macrotile.
+    BE_WORK storeWork;
+    storeWork.type = STORETILES;
+    storeWork.pfnWork = ProcessStoreTileBE;
+    storeWork.desc.storeTiles = *pStoreDesc;
+
+    for (uint32_t tileX = 0; tileX < tileCountX; ++tileX)
+    {
+        for (uint32_t tileY = 0; tileY < tileCountY; ++tileY)
+        {
+            pMacroTiles->enqueue(tileX, tileY, &storeWork);
+        }
+    }
+
+    RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrInvalidateTiles. Enqueues one invalidate work
+///        item on every macrotile from (0, 0) up to the extent of
+///        viewport 0 (x + width, y + height, rounded up to whole
+///        macrotiles).
+/// @param pContext - pointer to SWR context (unused here).
+/// @param pDC - pointer to draw context; supplies API state and tile manager.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pUserData - Pointer to the INVALIDATE_TILES_DESC that is copied
+///                    into the work item.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessInvalidateTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessInvalidateTiles);
+    INVALIDATE_TILES_DESC *pInvalidateDesc = static_cast<INVALIDATE_TILES_DESC*>(pUserData);
+    MacroTileMgr *pMacroTiles = pDC->pTileMgr;
+
+    const API_STATE& apiState = GetApiState(pDC);
+
+    // Number of macrotiles needed to cover viewport 0 in each direction.
+    uint32_t tileWidth = KNOB_MACROTILE_X_DIM;
+    uint32_t tileHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t tileCountX = (static_cast<uint32_t>(apiState.vp[0].width) + static_cast<uint32_t>(apiState.vp[0].x) + (tileWidth - 1)) / tileWidth;
+    uint32_t tileCountY = (static_cast<uint32_t>(apiState.vp[0].height) + static_cast<uint32_t>(apiState.vp[0].y) + (tileHeight - 1)) / tileHeight;
+
+    // The same invalidate work item is enqueued for every covered macrotile.
+    BE_WORK invalidateWork;
+    invalidateWork.type = INVALIDATETILES;
+    invalidateWork.pfnWork = ProcessInvalidateTilesBE;
+    invalidateWork.desc.invalidateTiles = *pInvalidateDesc;
+
+    for (uint32_t tileX = 0; tileX < tileCountX; ++tileX)
+    {
+        for (uint32_t tileY = 0; tileY < tileCountY; ++tileY)
+        {
+            pMacroTiles->enqueue(tileX, tileY, &invalidateWork);
+        }
+    }
+
+    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of primitives given the number of verts.
+/// @param mode - primitive topology for draw operation.
+/// @param numPrims - despite its name, the number of vertices or indices
+///                   for the draw.
+/// @return number of complete primitives; 0 (after asserting) for
+///         unsupported topologies.
+/// @todo Frontend needs to be refactored. This will go in appropriate place then.
+uint32_t GetNumPrims(
+    PRIMITIVE_TOPOLOGY mode,
+    uint32_t numPrims)
+{
+    // Local alias that says what the value really is.
+    const uint32_t vertCount = numPrims;
+
+    switch (mode)
+    {
+    // One primitive per vertex.
+    case TOP_POINT_LIST:
+    case TOP_LINE_LOOP:
+        return vertCount;
+
+    // Lists: a fixed number of vertices per primitive.
+    case TOP_LINE_LIST:
+        return vertCount / 2;
+    case TOP_TRIANGLE_LIST:
+    case TOP_RECT_LIST:
+        return vertCount / 3;
+    case TOP_QUAD_LIST:
+    case TOP_LINE_LIST_ADJ:
+        return vertCount / 4;
+    case TOP_TRI_LIST_ADJ:
+        return vertCount / 6;
+
+    // Strips, fans and discs: guarded so short inputs yield 0 prims
+    // instead of wrapping around.
+    case TOP_LINE_STRIP:
+    case TOP_TRIANGLE_DISC:
+        return (vertCount >= 2) ? vertCount - 1 : 0;
+    case TOP_TRIANGLE_STRIP:
+    case TOP_TRIANGLE_FAN:
+        return (vertCount >= 3) ? vertCount - 2 : 0;
+    case TOP_LISTSTRIP_ADJ:
+        return (vertCount >= 3) ? vertCount - 3 : 0;
+    case TOP_QUAD_STRIP:
+        return (vertCount >= 4) ? (vertCount - 2) / 2 : 0;
+    case TOP_TRI_STRIP_ADJ:
+        return (vertCount >= 4) ? (vertCount / 2) - 2 : 0;
+
+    // Patch lists: control point count is the distance from the base enum.
+    case TOP_PATCHLIST_1:  case TOP_PATCHLIST_2:  case TOP_PATCHLIST_3:  case TOP_PATCHLIST_4:
+    case TOP_PATCHLIST_5:  case TOP_PATCHLIST_6:  case TOP_PATCHLIST_7:  case TOP_PATCHLIST_8:
+    case TOP_PATCHLIST_9:  case TOP_PATCHLIST_10: case TOP_PATCHLIST_11: case TOP_PATCHLIST_12:
+    case TOP_PATCHLIST_13: case TOP_PATCHLIST_14: case TOP_PATCHLIST_15: case TOP_PATCHLIST_16:
+    case TOP_PATCHLIST_17: case TOP_PATCHLIST_18: case TOP_PATCHLIST_19: case TOP_PATCHLIST_20:
+    case TOP_PATCHLIST_21: case TOP_PATCHLIST_22: case TOP_PATCHLIST_23: case TOP_PATCHLIST_24:
+    case TOP_PATCHLIST_25: case TOP_PATCHLIST_26: case TOP_PATCHLIST_27: case TOP_PATCHLIST_28:
+    case TOP_PATCHLIST_29: case TOP_PATCHLIST_30: case TOP_PATCHLIST_31: case TOP_PATCHLIST_32:
+        return vertCount / (mode - TOP_PATCHLIST_BASE);
+
+    // Topologies this function does not handle.
+    case TOP_POLYGON:
+    case TOP_POINT_LIST_BF:
+    case TOP_LINE_STRIP_CONT:
+    case TOP_LINE_STRIP_BF:
+    case TOP_LINE_STRIP_CONT_BF:
+    case TOP_TRIANGLE_FAN_NOSTIPPLE:
+    case TOP_TRI_STRIP_REVERSE:
+    case TOP_PATCHLIST_BASE:
+    case TOP_UNKNOWN:
+        SWR_ASSERT(false, "Unsupported topology: %d", mode);
+        return 0;
+    }
+
+    return 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of verts given the number of primitives.
+/// @param mode - primitive topology for draw operation.
+/// @param numPrims - number of primitives for draw.
+/// @return number of vertices needed for numPrims primitives; 0 (after
+///         asserting) for unsupported topologies.
+uint32_t GetNumVerts(
+    PRIMITIVE_TOPOLOGY mode,
+    uint32_t numPrims)
+{
+    // Strip-like topologies need extra leading vertices, but only when at
+    // least one primitive is requested.
+    const bool hasPrims = (numPrims != 0);
+
+    switch (mode)
+    {
+    // One vertex per primitive.
+    case TOP_POINT_LIST:
+    case TOP_LINE_LOOP:
+        return numPrims;
+
+    // Lists: a fixed number of vertices per primitive.
+    case TOP_LINE_LIST:
+        return numPrims * 2;
+    case TOP_TRIANGLE_LIST:
+    case TOP_RECT_LIST:
+        return numPrims * 3;
+    case TOP_QUAD_LIST:
+    case TOP_LINE_LIST_ADJ:
+        return numPrims * 4;
+    case TOP_TRI_LIST_ADJ:
+        return numPrims * 6;
+
+    // Strips, fans and discs.
+    case TOP_LINE_STRIP:
+    case TOP_TRIANGLE_DISC:
+        return hasPrims ? numPrims + 1 : 0;
+    case TOP_TRIANGLE_STRIP:
+    case TOP_TRIANGLE_FAN:
+        return hasPrims ? numPrims + 2 : 0;
+    case TOP_LISTSTRIP_ADJ:
+        return hasPrims ? numPrims + 3 : 0;
+    case TOP_QUAD_STRIP:
+        return hasPrims ? numPrims * 2 + 2 : 0;
+    case TOP_TRI_STRIP_ADJ:
+        return hasPrims ? (numPrims + 2) * 2 : 0;
+
+    // Patch lists: control point count is the distance from the base enum.
+    case TOP_PATCHLIST_1:  case TOP_PATCHLIST_2:  case TOP_PATCHLIST_3:  case TOP_PATCHLIST_4:
+    case TOP_PATCHLIST_5:  case TOP_PATCHLIST_6:  case TOP_PATCHLIST_7:  case TOP_PATCHLIST_8:
+    case TOP_PATCHLIST_9:  case TOP_PATCHLIST_10: case TOP_PATCHLIST_11: case TOP_PATCHLIST_12:
+    case TOP_PATCHLIST_13: case TOP_PATCHLIST_14: case TOP_PATCHLIST_15: case TOP_PATCHLIST_16:
+    case TOP_PATCHLIST_17: case TOP_PATCHLIST_18: case TOP_PATCHLIST_19: case TOP_PATCHLIST_20:
+    case TOP_PATCHLIST_21: case TOP_PATCHLIST_22: case TOP_PATCHLIST_23: case TOP_PATCHLIST_24:
+    case TOP_PATCHLIST_25: case TOP_PATCHLIST_26: case TOP_PATCHLIST_27: case TOP_PATCHLIST_28:
+    case TOP_PATCHLIST_29: case TOP_PATCHLIST_30: case TOP_PATCHLIST_31: case TOP_PATCHLIST_32:
+        return numPrims * (mode - TOP_PATCHLIST_BASE);
+
+    // Topologies this function does not handle.
+    case TOP_POLYGON:
+    case TOP_POINT_LIST_BF:
+    case TOP_LINE_STRIP_CONT:
+    case TOP_LINE_STRIP_BF:
+    case TOP_LINE_STRIP_CONT_BF:
+    case TOP_TRIANGLE_FAN_NOSTIPPLE:
+    case TOP_TRI_STRIP_REVERSE:
+    case TOP_PATCHLIST_BASE:
+    case TOP_UNKNOWN:
+        SWR_ASSERT(false, "Unsupported topology: %d", mode);
+        return 0;
+    }
+
+    return 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Return number of verts per primitive.
+/// @param topology - topology
+/// @param includeAdjVerts - include adjacent verts in primitive vertices;
+///        only changes the result for the *_ADJ topologies (4 for lines,
+///        6 for triangles).
+/// @return vertex count per primitive; 0 (after asserting) for
+///         topologies that are not handled.
+INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
+{
+    // Adjacency topologies report their full vertex count when requested.
+    if (includeAdjVerts)
+    {
+        switch (topology)
+        {
+        case TOP_LISTSTRIP_ADJ:
+        case TOP_LINE_LIST_ADJ:
+            return 4;
+        case TOP_TRI_STRIP_ADJ:
+        case TOP_TRI_LIST_ADJ:
+            return 6;
+        default:
+            break;
+        }
+    }
+
+    switch (topology)
+    {
+    // Points
+    case TOP_POINT_LIST:
+    case TOP_POINT_LIST_BF:
+        return 1;
+
+    // Lines
+    case TOP_LINE_LIST:
+    case TOP_LINE_STRIP:
+    case TOP_LINE_LIST_ADJ:
+    case TOP_LINE_LOOP:
+    case TOP_LINE_STRIP_CONT:
+    case TOP_LINE_STRIP_BF:
+    case TOP_LISTSTRIP_ADJ:
+        return 2;
+
+    // Triangles (and rects, which are specified with 3 verts)
+    case TOP_TRIANGLE_LIST:
+    case TOP_TRIANGLE_STRIP:
+    case TOP_TRIANGLE_FAN:
+    case TOP_TRI_LIST_ADJ:
+    case TOP_TRI_STRIP_ADJ:
+    case TOP_TRI_STRIP_REVERSE:
+    case TOP_RECT_LIST:
+        return 3;
+
+    // Quads
+    case TOP_QUAD_LIST:
+    case TOP_QUAD_STRIP:
+        return 4;
+
+    // Patch lists: control point count is the distance from the base enum.
+    case TOP_PATCHLIST_1:  case TOP_PATCHLIST_2:  case TOP_PATCHLIST_3:  case TOP_PATCHLIST_4:
+    case TOP_PATCHLIST_5:  case TOP_PATCHLIST_6:  case TOP_PATCHLIST_7:  case TOP_PATCHLIST_8:
+    case TOP_PATCHLIST_9:  case TOP_PATCHLIST_10: case TOP_PATCHLIST_11: case TOP_PATCHLIST_12:
+    case TOP_PATCHLIST_13: case TOP_PATCHLIST_14: case TOP_PATCHLIST_15: case TOP_PATCHLIST_16:
+    case TOP_PATCHLIST_17: case TOP_PATCHLIST_18: case TOP_PATCHLIST_19: case TOP_PATCHLIST_20:
+    case TOP_PATCHLIST_21: case TOP_PATCHLIST_22: case TOP_PATCHLIST_23: case TOP_PATCHLIST_24:
+    case TOP_PATCHLIST_25: case TOP_PATCHLIST_26: case TOP_PATCHLIST_27: case TOP_PATCHLIST_28:
+    case TOP_PATCHLIST_29: case TOP_PATCHLIST_30: case TOP_PATCHLIST_31: case TOP_PATCHLIST_32:
+        return topology - TOP_PATCHLIST_BASE;
+
+    default:
+        SWR_ASSERT(false, "Unsupported topology: %d", topology);
+        return 0;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate mask from remaining work.
+/// @param numItemsRemaining - Number of work items still to be processed;
+///        values of KNOB_SIMD_WIDTH or more select every lane.
+/// @return result of vMask() for a bitmask with one low bit set per
+///         active lane, reinterpreted as an integer vector.
+static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
+{
+    // Clamp the number of active lanes to the SIMD width.
+    uint32_t activeLanes = numItemsRemaining;
+    if (activeLanes >= KNOB_SIMD_WIDTH)
+    {
+        activeLanes = KNOB_SIMD_WIDTH;
+    }
+
+    // One bit per active lane, starting at bit 0.
+    uint32_t laneBits = 0;
+    if (activeLanes > 0)
+    {
+        laneBits = (1 << activeLanes) - 1;
+    }
+
+    return _simd_castps_si(vMask(laneBits));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief StreamOut - Streams vertex data out to SO buffers.
+/// Generally, we are only streaming out a SIMDs worth of triangles.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pa - primitive assembler the prims are read from; pPrimData - scratch buffer filled per prim and handed to the SO function; streamIndex - selects the stream mask and SO function.
+static void StreamOut(
+ DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ uint32_t* pPrimData,
+ uint32_t streamIndex)
+{
+ RDTSC_START(FEStreamout);
+
+ SWR_CONTEXT* pContext = pDC->pContext; // NOTE(review): not referenced directly below; possibly used by the stat macros - confirm before removing
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_STREAMOUT_STATE &soState = state.soState;
+
+ uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
+
+ // The pPrimData buffer is sparse in that we allocate memory for all KNOB_NUM_ATTRIBUTES attributes (4 floats each) for each vertex.
+ uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+
+ SWR_STREAMOUT_CONTEXT soContext = { 0 };
+
+ // Setup buffer state pointers.
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ soContext.pBuffer[i] = &state.soBuffer[i];
+ }
+
+ uint32_t numPrims = pa.NumPrims();
+ for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
+ {
+ DWORD slot = 0;
+ uint32_t soMask = soState.streamMasks[streamIndex];
+
+ // Write all entries into primitive data buffer for SOS; one iteration per set bit in soMask, lowest bit first.
+ while (_BitScanForward(&slot, soMask))
+ {
+ __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+ uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+ pa.AssembleSingle(paSlot, primIndex, attrib);
+
+ // Attribute offset is relative offset from start of vertex.
+ // Note that attributes are read from the PA buffer at slot + VERTEX_ATTRIB_START_SLOT (paSlot above), but are
+ // written to prim data using the unadjusted slot index, i.e. starting at slot 0.
+ // NOTE(review): the original comment mentioned GL "needs slot 0" - confirm whether this mapping is API dependent.
+ uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
+
+ // Store each vertex's attrib at appropriate locations in pPrimData buffer.
+ for (uint32_t v = 0; v < soVertsPerPrim; ++v)
+ {
+ uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+
+ _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
+ }
+ soMask &= ~(1 << slot); // clear the bit just handled so the scan advances
+ }
+
+ // Update pPrimData pointer
+ soContext.pPrimData = pPrimData;
+
+ // Call SOS
+ SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
+ state.pfnSoFunc[streamIndex](soContext);
+ }
+
+ // Update SO write offset. The driver provides memory for the update.
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ if (state.soBuffer[i].pWriteOffset)
+ {
+ *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); // streamOffset is scaled by sizeof(uint32_t) for the write-back value
+
+ // The SOS increments the existing write offset, so the SoWriteOffset stat is
+ // set to the absolute offset (SET_STAT) rather than incremented by it.
+ SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
+ }
+ }
+
+ UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+ UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+
+ RDTSC_STOP(FEStreamout, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of invocations for one SIMD batch: the
+///        work remaining between curIndex and maxIndex, capped at
+///        KNOB_SIMD_WIDTH.
+/// @param curIndex - The start index for the SIMD.
+/// @param maxIndex - The end index for all work items; maxIndex - curIndex
+///                   is taken as the amount of work left.
+static INLINE uint32_t GetNumInvocations(
+    uint32_t curIndex,
+    uint32_t maxIndex)
+{
+    const uint32_t itemsLeft = maxIndex - curIndex;
+
+    // A full SIMD's worth (or more) remains: run every lane.
+    if (itemsLeft >= KNOB_SIMD_WIDTH)
+    {
+        return KNOB_SIMD_WIDTH;
+    }
+
+    // Partial batch: only the remaining items are invoked.
+    return itemsLeft;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
+///        The geometry shader will loop over each active streamout buffer, assembling
+///        primitives for the downstream stages. When multistream output is enabled,
+///        the generated stream ID buffer from the GS needs to be converted to a cut
+///        buffer for the primitive assembler.
+///        The stream ID buffer holds 2 bits per vertex and the cut buffer 1 bit per
+///        vertex; a cut bit is set for every vertex whose stream ID differs from
+///        'stream'.
+/// @param stream - stream id to generate the cut buffer for
+/// @param pStreamIdBase - pointer to the stream ID buffer; only its first
+///        ceil(numEmittedVerts / 4) bytes are read
+/// @param numEmittedVerts - Number of total verts emitted by the GS
+/// @param pCutBuffer - output buffer to write cuts to; receives
+///        max(ceil(numEmittedVerts / 8), 1) bytes.
+///        NOTE(review): assumes the caller's cut buffer is sized for one bit per
+///        emitted vertex - confirm at the allocation site.
+void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+{
+    SWR_ASSERT(stream < MAX_SO_STREAMS);
+
+    const uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
+
+    // One cut bit per vertex, rounded UP to whole bytes. Deriving this from
+    // numInputBytes / 2 rounds down and drops the cut bits of the trailing
+    // vertices whenever the input byte count is odd (e.g. 12 verts -> 3 input
+    // bytes -> only 1 output byte instead of 2). At least one byte is always
+    // written, as before.
+    const uint32_t numOutputBytes = std::max((numEmittedVerts + 7) / 8, 1U);
+
+    for (uint32_t b = 0; b < numOutputBytes; ++b)
+    {
+        uint8_t outByte = 0;
+
+        // Each output byte is built from two input bytes (4 vertices each).
+        for (uint32_t half = 0; half < 2; ++half)
+        {
+            const uint32_t inputIndex = 2 * b + half;
+            if (inputIndex >= numInputBytes)
+            {
+                // No stream IDs were emitted for these vertices; leave their
+                // (padding) cut bits clear instead of reading past the data.
+                break;
+            }
+
+            uint8_t curInputByte = pStreamIdBase[inputIndex];
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                if ((curInputByte & 0x3) != stream)
+                {
+                    outByte |= (1 << (i + 4 * half));
+                }
+                curInputByte >>= 2;
+            }
+        }
+
+        *pCutBuffer++ = outByte;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Implements GS stage: runs the geometry shader once per GS instance on the assembled input prims, then re-assembles the emitted verts per stream for stream-out and/or clipping.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pa - The primitive assembly object.
+/// @param pGsOut - output stream for GS
+template <
+ bool HasStreamOutT,
+ bool HasRastT>
+static void GeometryShaderStage(
+ DRAW_CONTEXT *pDC,
+ uint32_t workerId,
+ PA_STATE& pa,
+ void* pGsOut,
+ void* pCutBuffer, ///< GS cut buffer (single stream) or stream ID buffer (multi-stream)
+ void* pStreamCutBuffer, ///< scratch cut buffer filled by ProcessStreamIdBuffer in multi-stream mode
+ uint32_t* pSoPrimData, ///< forwarded unchanged to StreamOut
+ simdscalari primID) ///< primitive ID for each SIMD lane of input prims
+{
+ RDTSC_START(FEGeometryShader);
+
+ SWR_GS_CONTEXT gsContext;
+ SWR_CONTEXT* pContext = pDC->pContext; // not referenced directly below; presumably needed by the UPDATE_STAT macro, as noted in TessellationStages
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_GS_STATE* pState = &state.gsState;
+
+ SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
+ SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
+
+ gsContext.pStream = (uint8_t*)pGsOut;
+ gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+ gsContext.PrimitiveID = primID;
+
+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
+ simdvector attrib[MAX_ATTRIBUTES]; // scratch: one assembled attribute per input vertex; reused later for the GS output position
+
+ // assemble all attributes for the input primitive
+ for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ pa.Assemble(attribSlot, attrib);
+
+ for (uint32_t i = 0; i < numVertsPerPrim; ++i)
+ {
+ gsContext.vert[i].attrib[attribSlot] = attrib[i];
+ }
+ }
+
+ // assemble position
+ pa.Assemble(VERTEX_POSITION_SLOT, attrib);
+ for (uint32_t i = 0; i < numVertsPerPrim; ++i)
+ {
+ gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+ }
+
+ const uint32_t vertexStride = sizeof(simdvertex);
+ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; // simdvertex slots needed to hold maxNumVerts
+ const uint32_t inputPrimStride = numSimdBatches * vertexStride; // bytes of GS output per input prim
+ const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; // bytes of GS output per GS instance
+ uint32_t cutPrimStride;
+ uint32_t cutInstanceStride;
+
+ if (pState->isSingleStream)
+ {
+ cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; // 1 cut bit per vertex, rounded up to bytes
+ cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
+ }
+ else
+ {
+ cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); // 2 stream ID bits per vertex; NOTE(review): the division rounds down before AlignUp - confirm this always covers maxNumVerts
+ cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
+ }
+
+ // record valid prims from the frontend to avoid over binning the newly generated
+ // prims from the GS
+ uint32_t numInputPrims = pa.NumPrims();
+
+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
+ {
+ gsContext.InstanceID = instance;
+ gsContext.mask = GenerateMask(numInputPrims);
+
+ // execute the geometry shader
+ state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+
+ gsContext.pStream += instanceStride; // advance the output pointers to the next instance's region
+ gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ }
+
+ // set up new binner and state for the GS output topology
+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
+ if (HasRastT)
+ {
+ switch (pState->outputTopology)
+ {
+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
+ default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); // pfnClipFunc stays nullptr on this path
+ }
+ }
+
+ // foreach input prim:
+ // - setup a new PA based on the emitted verts for that prim
+ // - loop over the new verts, calling PA to assemble each prim
+ uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; // per-lane view of the vertex counts in the GS context
+ uint32_t* pPrimitiveId = (uint32_t*)&primID; // per-lane view of the input primitive IDs
+
+ uint32_t totalPrimsGenerated = 0;
+ for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
+ {
+ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
+ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
+ {
+ uint32_t numEmittedVerts = pVertexCount[inputPrim]; // NOTE(review): read after all instances ran and indexed only by inputPrim - confirm the count is valid for every instance
+ if (numEmittedVerts == 0)
+ {
+ continue;
+ }
+
+ uint8_t* pBase = pInstanceBase + instance * instanceStride;
+ uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
+
+ DWORD numAttribs;
+ _BitScanReverse(&numAttribs, state.feAttribMask); // index of the highest set bit in feAttribMask; does not depend on the loop variables
+ numAttribs++;
+
+ for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
+ {
+ bool processCutVerts = false;
+
+ uint8_t* pCutBuffer = pCutBase; // note: shadows the pCutBuffer function parameter inside this loop
+
+ // assign default stream ID, only relevant when GS is outputting a single stream
+ uint32_t streamID = 0;
+ if (pState->isSingleStream)
+ {
+ processCutVerts = true;
+ streamID = pState->singleStreamID;
+ if (streamID != stream) continue;
+ }
+ else
+ {
+ // early exit if this stream is not enabled for streamout
+ if (HasStreamOutT && !state.soState.streamEnable[stream])
+ {
+ continue;
+ }
+
+ // multi-stream output, need to translate StreamID buffer to a cut buffer
+ ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
+ pCutBuffer = (uint8_t*)pStreamCutBuffer;
+ processCutVerts = false; // already false here; kept explicit for the multi-stream path
+ }
+
+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+
+ while (gsPa.GetNextStreamOutput())
+ {
+ do
+ {
+ bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+
+ if (assemble)
+ {
+ totalPrimsGenerated += gsPa.NumPrims();
+
+ if (HasStreamOutT)
+ {
+ StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
+ }
+
+ if (HasRastT && state.soState.streamToRasterizer == stream) // only the stream routed to the rasterizer is clipped/binned
+ {
+ simdscalari vPrimId;
+ // pull primitiveID from the GS output if available
+ if (state.gsState.emitsPrimitiveID)
+ {
+ simdvector primIdAttrib[3];
+ gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
+ vPrimId = _simd_castps_si(primIdAttrib[0].x);
+ }
+ else
+ {
+ vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); // otherwise broadcast the input prim's ID to all generated prims
+ }
+
+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+ }
+ }
+ } while (gsPa.NextPrim());
+ }
+ }
+ }
+ }
+
+ // update GS pipeline stats
+ UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
+ UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
+
+ RDTSC_STOP(FEGeometryShader, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Carves the geometry shader scratch buffers out of the draw's arena.
+/// @param pDC - pointer to draw context; supplies the arena.
+/// @param state - API state; all sizes are derived from state.gsState.
+/// @param ppGsOut - receives the GS output vertex buffer allocation
+/// @param ppCutBuffer - receives the cut buffer (single stream) or the
+///        stream ID buffer (multi-stream) allocation
+/// @param ppStreamCutBuffer - receives the temporary per-stream cut buffer
+///        when multi-stream is enabled, nullptr otherwise
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
+    void **ppStreamCutBuffer)
+{
+    Arena* pArena = pDC->pArena;
+    SWR_ASSERT(pArena != nullptr);
+    SWR_ASSERT(state.gsState.gsEnable);
+
+    // GS output verts: enough simdvertex slots to hold maxNumVerts, for every
+    // SIMD lane and every GS instance.
+    // @todo pack attribs
+    // @todo support multiple streams
+    const uint32_t simdVertexBytes = sizeof(simdvertex);
+    const uint32_t batchesPerPrim = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
+    uint32_t gsOutBytes = state.gsState.instanceCount * batchesPerPrim * simdVertexBytes * KNOB_SIMD_WIDTH;
+    *ppGsOut = pArena->AllocAligned(gsOutBytes, KNOB_SIMD_WIDTH * sizeof(float));
+
+    // The cut buffer is essentially a bitfield sized to the maximum vertex
+    // output as defined by the GS state, per SIMD lane, per GS instance.
+    const uint32_t cutBytesPerPrim = (state.gsState.maxNumVerts + 7) / 8;
+    const uint32_t cutBytes = cutBytesPerPrim * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
+
+    if (!state.gsState.isSingleStream)
+    {
+        // Multi-stream: the GS writes a stream ID buffer, and a separate
+        // temporary cut buffer is needed for the per-stream conversion.
+        const uint32_t streamIdBytesPerPrim = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
+        const uint32_t streamIdBytes = streamIdBytesPerPrim * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
+
+        *ppCutBuffer = pArena->AllocAligned(streamIdBytes, KNOB_SIMD_WIDTH * sizeof(float));
+        *ppStreamCutBuffer = pArena->AllocAligned(cutBytes, KNOB_SIMD_WIDTH * sizeof(float));
+        return;
+    }
+
+    // Single stream: the GS writes the cut buffer directly.
+    *ppCutBuffer = pArena->AllocAligned(cutBytes, KNOB_SIMD_WIDTH * sizeof(float));
+    *ppStreamCutBuffer = nullptr;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Contains all data generated by the HS and passed to the
+/// tessellator and DS, plus scratch allocations that TessellationStages keeps between calls.
+struct TessellationThreadLocalData
+{
+ SWR_HS_CONTEXT hsContext; ///< hull shader context that TessellationStages fills in and passes to pfnHsFunc
+ ScalarPatch patchData[KNOB_SIMD_WIDTH]; ///< HS output patches, one per SIMD lane; hsContext.pCPout is pointed here
+ void* pTxCtx; ///< tessellator context memory handed to TSInitCtx; allocated when TSInitCtx returns nullptr
+ size_t tsCtxSize; ///< size passed to TSInitCtx with pTxCtx and used as the allocation size for pTxCtx
+
+ simdscalar* pDSOutput; ///< domain shader output buffer; reallocated when more vectors are required than numDSOutputVectors
+ size_t numDSOutputVectors; ///< number of output vectors pDSOutput was last sized for
+};
+
+THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; ///< declared with the THREAD storage macro; allocated on first use by AllocateTessellationData
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Lazily creates the zero-initialized TessellationThreadLocalData
+///        behind gt_pTessellationThreadData; does nothing once it exists.
+/// @param pContext - pointer to SWR context (not referenced by this function).
+INLINE
+static void AllocateTessellationData(SWR_CONTEXT* pContext)
+{
+    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
+    if (gt_pTessellationThreadData != nullptr)
+    {
+        // Already allocated by an earlier call.
+        return;
+    }
+
+    void* pStorage = _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
+    gt_pTessellationThreadData = (TessellationThreadLocalData*)pStorage;
+    memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Implements Tessellation Stages: runs the HS on the assembled patches, tessellates each patch, runs the DS over the generated domain points, then hands the resulting prims to the GS stage or to stream-out/clipping.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param pa - The primitive assembly object.
+/// @param pGsOut - output stream for GS; only forwarded to GeometryShaderStage
+template <
+ bool HasGeometryShaderT,
+ bool HasStreamOutT,
+ bool HasRastT>
+static void TessellationStages(
+ DRAW_CONTEXT *pDC,
+ uint32_t workerId,
+ PA_STATE& pa,
+ void* pGsOut,
+ void* pCutBuffer, ///< forwarded to GeometryShaderStage
+ void* pCutStreamBuffer, ///< forwarded to GeometryShaderStage
+ uint32_t* pSoPrimData, ///< forwarded to StreamOut / GeometryShaderStage
+ simdscalari primID) ///< primitive ID for each SIMD lane of input patches
+{
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_TS_STATE& tsState = state.tsState;
+ SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
+
+ SWR_ASSERT(gt_pTessellationThreadData);
+
+ HANDLE tsCtx = TSInitCtx( // first attempt reuses the context memory cached in the thread data
+ tsState.domain,
+ tsState.partitioning,
+ tsState.tsOutputTopology,
+ gt_pTessellationThreadData->pTxCtx,
+ gt_pTessellationThreadData->tsCtxSize);
+ if (tsCtx == nullptr)
+ {
+ gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64); // NOTE(review): presumably TSInitCtx updated tsCtxSize to the required size - confirm; a previous pTxCtx is not freed here
+ tsCtx = TSInitCtx(
+ tsState.domain,
+ tsState.partitioning,
+ tsState.tsOutputTopology,
+ gt_pTessellationThreadData->pTxCtx,
+ gt_pTessellationThreadData->tsCtxSize);
+ }
+ SWR_ASSERT(tsCtx);
+
+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
+ if (HasRastT)
+ {
+ switch (tsState.postDSTopology)
+ {
+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
+ case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
+ default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology); // pfnClipFunc stays nullptr on this path
+ }
+ }
+
+ SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
+ hsContext.pCPout = gt_pTessellationThreadData->patchData;
+ hsContext.PrimitiveID = primID;
+
+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
+ // Max storage for one attribute for an entire simdprimitive
+ simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
+
+ // assemble all attributes for the input primitives
+ for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
+ {
+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ pa.Assemble(attribSlot, simdattrib);
+
+ for (uint32_t i = 0; i < numVertsPerPrim; ++i)
+ {
+ hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
+ }
+ }
+
+#if defined(_DEBUG)
+ memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); // debug builds: fill the HS output with a recognizable pattern
+#endif
+
+ uint32_t numPrims = pa.NumPrims();
+ hsContext.mask = GenerateMask(numPrims);
+
+ // Run the HS
+ RDTSC_START(FEHullShader);
+ state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
+ RDTSC_STOP(FEHullShader, 0, 0);
+
+ UPDATE_STAT(HsInvocations, numPrims);
+
+ const uint32_t* pPrimId = (const uint32_t*)&primID; // per-lane view of the input primitive IDs
+
+ for (uint32_t p = 0; p < numPrims; ++p)
+ {
+ // Run Tessellator
+ SWR_TS_TESSELLATED_DATA tsData = { 0 };
+ RDTSC_START(FETessellation);
+ TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
+ RDTSC_STOP(FETessellation, 0, 0);
+
+ if (tsData.NumPrimitives == 0)
+ {
+ continue;
+ }
+ SWR_ASSERT(tsData.NumDomainPoints);
+
+ // Allocate DS Output memory
+ uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; // number of SIMD-wide DS invocations for this patch
+ size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
+ size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+ if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) // grow-only: the buffer is reused across patches and calls
+ {
+ _aligned_free(gt_pTessellationThreadData->pDSOutput);
+ gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64); // NOTE(review): the result is only checked by the SWR_ASSERT below
+ gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
+ }
+ SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
+ SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
+
+#if defined(_DEBUG)
+ memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
+#endif
+
+ // Run Domain Shader
+ SWR_DS_CONTEXT dsContext;
+ dsContext.PrimitiveID = pPrimId[p];
+ dsContext.pCpIn = &hsContext.pCPout[p];
+ dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
+ dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
+ dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+ dsContext.vectorStride = requiredDSVectorInvocations;
+
+ uint32_t dsInvocations = 0; // domain points covered by the DS invocations issued so far
+
+ for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
+ {
+ dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); // mask derived from the domain points still to be shaded
+
+ RDTSC_START(FEDomainShader);
+ state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
+ RDTSC_STOP(FEDomainShader, 0, 0);
+
+ dsInvocations += KNOB_SIMD_WIDTH;
+ }
+ UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
+
+ PA_TESS tessPa(
+ pDC,
+ dsContext.pOutputData,
+ dsContext.vectorStride,
+ tsState.numDsOutputAttribs,
+ tsData.ppIndices,
+ tsData.NumPrimitives,
+ tsState.postDSTopology);
+
+ while (tessPa.HasWork())
+ {
+ if (HasGeometryShaderT)
+ {
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
+ _simd_set1_epi32(dsContext.PrimitiveID));
+ }
+ else
+ {
+ if (HasStreamOutT)
+ {
+ StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
+ }
+
+ if (HasRastT)
+ {
+ simdvector prim[3]; // Only deal with triangles, lines, or points
+ RDTSC_START(FEPAAssemble);
+#if SWR_ENABLE_ASSERTS
+ bool assemble =
+#endif
+ tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
+ RDTSC_STOP(FEPAAssemble, 1, 0);
+ SWR_ASSERT(assemble);
+
+ SWR_ASSERT(pfnClipFunc);
+ pfnClipFunc(pDC, tessPa, workerId, prim,
+ GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+ }
+ }
+
+ tessPa.NextPrim();
+
+ } // while (tessPa.HasWork())
+ } // for (uint32_t p = 0; p < numPrims; ++p)
+
+ TSDestroyCtx(tsCtx);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrDraw: for each instance, fetches and vertex-shades one SIMD of vertices at a time, assembles primitives, and routes them to tessellation, the GS stage, or stream-out/binning.
+/// @tparam IsIndexedT - Is indexed drawing enabled
+/// @tparam HasTessellationT - Is tessellation enabled
+/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
+/// @tparam HasStreamOutT - Is stream-out enabled
+/// @tparam HasRastT - Is rasterization enabled
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id.
+/// @param pUserData - Pointer to DRAW_WORK
+template <
+ bool IsIndexedT,
+ bool HasTessellationT,
+ bool HasGeometryShaderT,
+ bool HasStreamOutT,
+ bool HasRastT>
+void ProcessDraw(
+ SWR_CONTEXT *pContext,
+ DRAW_CONTEXT *pDC,
+ uint32_t workerId,
+ void *pUserData)
+{
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (KNOB_TOSS_QUEUE_FE)
+ {
+ return;
+ }
+#endif
+
+ RDTSC_START(FEProcessDraw);
+
+ DRAW_WORK& work = *(DRAW_WORK*)pUserData;
+ const API_STATE& state = GetApiState(pDC);
+ __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // lane offsets 0..7 for non-indexed vertex IDs; NOTE(review): hard-codes 8 lanes, presumably KNOB_SIMD_WIDTH == 8 - confirm
+ SWR_VS_CONTEXT vsContext;
+ simdvertex vin;
+
+ int indexSize = 0; // bytes per index; stays 0 for non-indexed draws
+ uint32_t endVertex = work.numVerts;
+
+ const int32_t* pLastRequestedIndex = nullptr; // address just past the last index this draw requests
+ if (IsIndexedT)
+ {
+ switch (work.type)
+ {
+ case R32_UINT:
+ indexSize = sizeof(uint32_t);
+ pLastRequestedIndex = &(work.pIB[endVertex]);
+ break;
+ case R16_UINT:
+ indexSize = sizeof(uint16_t);
+ // nasty address offset to last index
+ pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
+ break;
+ case R8_UINT:
+ indexSize = sizeof(uint8_t);
+ // nasty address offset to last index
+ pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
+ break;
+ default:
+ SWR_ASSERT(0);
+ }
+ }
+ else
+ {
+ // No cuts, prune partial primitives.
+ endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
+ }
+
+ SWR_FETCH_CONTEXT fetchInfo = { 0 };
+ fetchInfo.pStreams = &state.vertexBuffers[0];
+ fetchInfo.StartInstance = work.startInstance;
+ fetchInfo.StartVertex = 0;
+
+ vsContext.pVin = &vin;
+
+ if (IsIndexedT)
+ {
+ fetchInfo.BaseVertex = work.baseVertex;
+
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); // end of the bound index buffer
+ if (pLastRequestedIndex < fetchInfo.pLastIndex)
+ {
+ fetchInfo.pLastIndex = pLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo.StartVertex = work.startVertex;
+ }
+
+#ifdef KNOB_ENABLE_RDTSC
+ uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); // only passed to the RDTSC_STOP at the end of this function
+#endif
+
+ void* pGsOut = nullptr;
+ void* pCutBuffer = nullptr;
+ void* pStreamCutBuffer = nullptr;
+ if (HasGeometryShaderT)
+ {
+ AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+ }
+
+ if (HasTessellationT)
+ {
+ SWR_ASSERT(state.tsState.tsEnable == true);
+ SWR_ASSERT(state.pfnHsFunc != nullptr);
+ SWR_ASSERT(state.pfnDsFunc != nullptr);
+
+ AllocateTessellationData(pContext);
+ }
+ else
+ {
+ SWR_ASSERT(state.tsState.tsEnable == false);
+ SWR_ASSERT(state.pfnHsFunc == nullptr);
+ SWR_ASSERT(state.pfnDsFunc == nullptr);
+ }
+
+ // allocate space for streamout input prim data
+ uint32_t* pSoPrimData = nullptr;
+ if (HasStreamOutT)
+ {
+ pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); // NOTE(review): fixed 4096-byte scratch - confirm it bounds what StreamOut writes per call
+
+ // seed the SoWriteOffset stats with each SO buffer's current stream offset
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
+ }
+
+ }
+
+ // choose primitive assembler
+ PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
+ PA_STATE& pa = paFactory.GetPA();
+
+ /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
+ {
+ simdscalari vIndex;
+ uint32_t i = 0; // index of the first vertex of the current SIMD
+
+ if (IsIndexedT)
+ {
+ fetchInfo.pIndices = work.pIB;
+ }
+ else
+ {
+ vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+ fetchInfo.pIndices = (const int32_t*)&vIndex; // non-indexed: the fetch reads the generated vertex IDs from vIndex
+ }
+
+ fetchInfo.CurInstance = instanceNum;
+ vsContext.InstanceID = instanceNum;
+
+ while (pa.HasWork())
+ {
+ // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
+ // So we need to keep this outside of (i < endVertex) check.
+ simdmask* pvCutIndices = nullptr;
+ if (IsIndexedT)
+ {
+ pvCutIndices = &pa.GetNextVsIndices();
+ }
+
+ simdvertex& vout = pa.GetNextVsOutput();
+ vsContext.pVout = &vout;
+
+ if (i < endVertex) // fetch and shade only while vertices remain; the PA may still report work afterwards
+ {
+
+ // 1. Execute FS/VS for a single SIMD.
+ RDTSC_START(FEFetchShader);
+ state.pfnFetchFunc(fetchInfo, vin);
+ RDTSC_STOP(FEFetchShader, 0, 0);
+
+ // forward fetch generated vertex IDs to the vertex shader
+ vsContext.VertexID = fetchInfo.VertexID;
+
+ // Setup active mask for vertex shader.
+ vsContext.mask = GenerateMask(endVertex - i);
+
+ // forward cut mask to the PA
+ if (IsIndexedT)
+ {
+ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
+ }
+
+ UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
+ {
+ RDTSC_START(FEVertexShader);
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
+ RDTSC_STOP(FEVertexShader, 0, 0);
+
+ UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
+ }
+ }
+
+ // 2. Assemble primitives given the last two SIMD.
+ do
+ {
+ simdvector prim[MAX_NUM_VERTS_PER_PRIM];
+ // PaAssemble returns false if there is not enough verts to assemble.
+ RDTSC_START(FEPAAssemble);
+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
+ RDTSC_STOP(FEPAAssemble, 1, 0);
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_VS)
+#endif
+ {
+ if (assemble)
+ {
+ UPDATE_STAT(IaPrimitives, pa.NumPrims());
+
+ if (HasTessellationT)
+ {
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+ pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ }
+ else if (HasGeometryShaderT)
+ {
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ }
+ else
+ {
+ // If streamout is enabled then stream vertices out to memory.
+ if (HasStreamOutT)
+ {
+ StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+ }
+
+ if (HasRastT)
+ {
+ SWR_ASSERT(pDC->pState->pfnProcessPrims);
+ pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+ }
+ }
+ }
+ }
+ }
+ } while (pa.NextPrim());
+
+ i += KNOB_SIMD_WIDTH;
+ if (IsIndexedT)
+ {
+ fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); // step the index pointer by one SIMD of indices
+ }
+ else
+ {
+ vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+ }
+ }
+ pa.Reset(); // reset the primitive assembler before the next instance
+ }
+
+ RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
+}
+// Explicit instantiation of all 32 combinations of <IsIndexedT, HasTessellationT, HasGeometryShaderT, HasStreamOutT, HasRastT>
+template void ProcessDraw<false, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<false, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+template void ProcessDraw<true, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Packs the linked attributes of one primitive for the backend,
+///        driven by the linkage mask and linkage map. For every set bit in
+///        the mask, 3 vertices x 4 floats are written to the output buffer
+///        (an SOA->AOS conversion and pack).
+/// @tparam NumVerts - Number of real vertices in the primitive.
+/// @param pDC - Draw context
+/// @param pa - Primitive Assembly state
+/// @param linkageMask - Specifies which VS outputs are routed to PS.
+/// @param pLinkageMap - maps VS attribute slot to PS slot
+/// @param triIndex - Triangle to process attributes for
+/// @param pBuffer - Output result
+template<uint32_t NumVerts>
+INLINE void ProcessAttributes(
+    DRAW_CONTEXT *pDC,
+    PA_STATE&pa,
+    uint32_t linkageMask,
+    const uint8_t* pLinkageMap,
+    uint32_t triIndex,
+    float *pBuffer)
+{
+    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
+    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
+
+    // One iteration per set bit in linkageMask; mapIdx counts the bits
+    // consumed so far and indexes both the linkage map and the constant
+    // interpolation mask.
+    DWORD linkedBit = 0;
+    for (uint32_t mapIdx = 0; _BitScanForward(&linkedBit, linkageMask); ++mapIdx)
+    {
+        linkageMask &= ~(1 << linkedBit); // done with this bit.
+
+        // compute absolute slot in vertex attrib array
+        const uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];
+
+        __m128 attrib[3]; // triangle attribs (always 4 wide)
+        pa.AssembleSingle(inputSlot, triIndex, attrib);
+
+        // Constant-interpolated attributes replicate the provoking vertex;
+        // all others store each vertex's own value.
+        const bool useProvokingVertex = _bittest(&constantInterpMask, mapIdx) != 0;
+        for (uint32_t v = 0; v < NumVerts; ++v)
+        {
+            _mm_store_ps(pBuffer, attrib[useProvokingVertex ? provokingVertex : v]);
+            pBuffer += 4;
+        }
+
+        // pad out the attrib buffer to 3 verts to ensure the triangle
+        // interpolation code in the pixel shader works correctly for the
+        // 3 topologies - point, line, tri. The last real vertex is repeated
+        // for the missing ones.
+        for (uint32_t v = NumVerts; v < 3; ++v)
+        {
+            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
+            pBuffer += 4;
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Processes enabled user clip distances. For each enabled distance
+///        it loads the per-vertex values from the PA, converts them to
+///        barycentric coefficients, and appends NumVerts floats to the
+///        output buffer.
+/// @tparam NumVerts - Number of vertices in the primitive.
+/// @param pa - Primitive Assembly state
+/// @param primIndex - primitive index to process
+/// @param clipDistMask - mask of enabled clip distances
+/// @param pUserClipBuffer - buffer to store results
+template<uint32_t NumVerts>
+void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
+{
+    DWORD distIndex;
+    while (_BitScanForward(&distIndex, clipDistMask))
+    {
+        clipDistMask &= ~(1 << distIndex);
+
+        // Distances 0-3 live in the LO slot, the rest in the HI slot; the low
+        // two bits select the component within the slot.
+        const uint32_t component = distIndex & 0x3;
+        const uint32_t attribSlot = ((distIndex >> 2) == 0) ?
+            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
+
+        __m128 assembled[3];
+        pa.AssembleSingle(attribSlot, primIndex, assembled);
+
+        // Pull the selected component out of each vertex's clip distance vector.
+        float dist[NumVerts];
+        for (uint32_t v = 0; v < NumVerts; ++v)
+        {
+            OSALIGNSIMD(float) lanes[4];
+            _mm_store_ps(lanes, assembled[v]);
+            dist[v] = lanes[component];
+        }
+
+        // setup plane equations for barycentric interpolation in the backend:
+        // every vertex but the last is stored relative to the last vertex,
+        // which is stored as-is.
+        const float lastDist = dist[NumVerts - 1];
+        for (uint32_t v = 0; v < NumVerts - 1; ++v)
+        {
+            *(pUserClipBuffer++) = dist[v] - lastDist;
+        }
+        *(pUserClipBuffer++) = lastDist;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin triangle primitives to macro tiles. Performs perspective divide,
+/// viewport transform, culling, scissoring and per-triangle work setup.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param tri - Contains triangle position data for a SIMD's worth of triangles; modified in place.
+/// @param primID - Primitive ID for each triangle.
+void BinTriangles(
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector tri[3],
+ uint32_t triMask, ///< one bit per SIMD lane; lanes whose bit is clear are skipped
+ simdscalari primID)
+{
+ RDTSC_START(FEBinTriangles);
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_RASTSTATE& rastState = state.rastState;
+ const SWR_FRONTEND_STATE& feState = state.frontendState;
+ const SWR_GS_STATE& gsState = state.gsState;
+
+ // NOTE(review): the original comment here read "Simple wireframe mode for debugging purposes only", but no wireframe code follows it
+
+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); // 1/w defaults to 1.0; overwritten below only when the viewport transform is enabled
+ simdscalar vRecipW1 = _simd_set1_ps(1.0f);
+ simdscalar vRecipW2 = _simd_set1_ps(1.0f);
+
+ if (!feState.vpTransformDisable)
+ {
+ // perspective divide
+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
+ vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
+
+ tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
+ tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
+ tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
+
+ tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
+ tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
+ tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
+
+ tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
+ tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
+ tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
+
+ // viewport transform to screen coords
+ viewportTransform<3>(tri, state.vpMatrix[0]); // only viewport matrix 0 is applied here
+ }
+
+ // adjust for pixel center location
+ simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; // the same offset is added to both x and y of every vertex
+ tri[0].x = _simd_add_ps(tri[0].x, offset);
+ tri[0].y = _simd_add_ps(tri[0].y, offset);
+
+ tri[1].x = _simd_add_ps(tri[1].x, offset);
+ tri[1].y = _simd_add_ps(tri[1].y, offset);
+
+ tri[2].x = _simd_add_ps(tri[2].x, offset);
+ tri[2].y = _simd_add_ps(tri[2].y, offset);
+
+ // convert to fixed point
+ simdscalari vXi[3], vYi[3];
+ vXi[0] = fpToFixedPointVertical(tri[0].x);
+ vYi[0] = fpToFixedPointVertical(tri[0].y);
+ vXi[1] = fpToFixedPointVertical(tri[1].x);
+ vYi[1] = fpToFixedPointVertical(tri[1].y);
+ vXi[2] = fpToFixedPointVertical(tri[2].x);
+ vYi[2] = fpToFixedPointVertical(tri[2].y);
+
+ // triangle setup
+ simdscalari vAi[3], vBi[3];
+ triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
+
+ // determinant
+ simdscalari vDet[2]; // compared below as 64-bit lanes, so the SIMD's worth of determinants is split across two registers
+ calcDeterminantIntVertical(vAi, vBi, vDet);
+
+ // cull zero area
+ int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); // lanes of the first half whose determinant is exactly zero
+ int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); // same for the second half
+
+ int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); // second-half bits occupy the upper KNOB_SIMD_WIDTH/2 positions
+
+ uint32_t origTriMask = triMask; // kept only to count how many triangles the culls below remove
+ triMask &= ~cullZeroAreaMask;
+
+ // determine front winding tris
+ // CW +det
+ // CCW -det
+ maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
+ maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
+ int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); // bit set where the determinant is strictly positive
+
+ uint32_t frontWindingTris;
+ if (rastState.frontWinding == SWR_FRONTWINDING_CW)
+ {
+ frontWindingTris = cwTriMask;
+ }
+ else
+ {
+ frontWindingTris = ~cwTriMask; // also sets bits for zero-area lanes, but those were already removed from triMask above
+ }
+
+ // cull
+ uint32_t cullTris;
+ switch ((SWR_CULLMODE)rastState.cullMode)
+ {
+ case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
+ case SWR_CULLMODE_NONE: cullTris = 0x0; break;
+ case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
+ case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
+ default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; // unknown mode: assert, then cull nothing
+ }
+
+ triMask &= ~cullTris;
+
+ if (origTriMask ^ triMask)
+ {
+ RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); // records the number of triangles removed so far
+ }
+
+ // compute per tri backface
+ uint32_t frontFaceMask = frontWindingTris;
+
+ uint32_t *pPrimID = (uint32_t *)&primID; // view the SIMD register as an array of 32-bit lanes
+ DWORD triIndex = 0;
+
+ if (!triMask) // every triangle was culled; skip straight to the profiling stop
+ {
+ goto endBinTriangles;
+ }
+
+ // Calc bounding box of triangles
+ simdBBox bbox;
+ calcBoundingBoxIntVertical(vXi, vYi, bbox);
+
+ // determine if triangle falls between pixel centers and discard
+ // only discard for non-MSAA case
+ // (left + 127) & ~255
+ // (right + 128) & ~255
+
+ if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
+ {
+ origTriMask = triMask; // re-baseline so the event below counts only this cull
+
+ int cullCenterMask;
+ {
+ simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
+ left = _simd_and_si(left, _simd_set1_epi32(~255)); // clears the low 8 bits
+ simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
+ right = _simd_and_si(right, _simd_set1_epi32(~255));
+
+ simdscalari vMaskH = _simd_cmpeq_epi32(left, right); // equal after rounding: flagged for discard on the horizontal axis
+
+ simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
+ top = _simd_and_si(top, _simd_set1_epi32(~255));
+ simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
+ bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
+
+ simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
+ vMaskV = _simd_or_si(vMaskH, vMaskV); // discard if either axis is flagged
+ cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
+ }
+
+ triMask &= ~cullCenterMask;
+
+ if(origTriMask ^ triMask)
+ {
+ RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
+ }
+ }
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+
+ // Cull tris completely outside scissor
+ {
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); // left > right after clamping means an empty intersection
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ triMask = triMask & ~maskOutsideScissor;
+ }
+
+ if (!triMask)
+ {
+ goto endBinTriangles;
+ }
+
+ // Convert triangle bbox to macrotile units.
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; // spilled to memory so the scalar loop below can index per triangle
+ _simd_store_si((simdscalari*)aMTLeft, bbox.left);
+ _simd_store_si((simdscalari*)aMTRight, bbox.right);
+ _simd_store_si((simdscalari*)aMTTop, bbox.top);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+
+ // transpose verts needed for backend
+ /// @todo modify BE to take non-transformed verts
+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; // hard-coded to 8 entries; indexed by triIndex, so this assumes KNOB_SIMD_WIDTH <= 8
+ vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); // presumably yields one __m128 per triangle holding that component for its three vertices -- confirm
+ vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
+ vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); // note: this row carries 1/w, not w
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai[3];
+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+ simdscalari vRtaii;
+ vRtaii = _simd_castps_si(vRtai[0].x); // x component of vertex 0, reinterpreted as integer bits
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); // index 0 for every triangle otherwise
+ }
+
+ // scan remaining valid triangles and bin each separately
+ while (_BitScanForward(&triIndex, triMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+ uint32_t numScalarAttribs = linkageCount * 4; // four floats per linked attribute
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri;
+
+ desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); // forceFront overrides the winding test
+ desc.triFlags.primID = pPrimID[triIndex];
+ desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
+
+ if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
+ {
+ work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
+ }
+ else
+ {
+ // for center sample pattern, all samples are at pixel center; calculate coverage
+ // once at center and broadcast the results in the backend
+ work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
+ }
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); // room for three vertices' worth of attributes
+ desc.pAttribs = pAttribs;
+ desc.numAttribs = linkageCount;
+ ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
+
+ // store triangle vertex data
+ desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); // four rows of four floats: x, y, z, 1/w
+
+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+ _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+
+ // store user clip distances
+ if (rastState.clipDistanceMask)
+ {
+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+ desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); // ProcessUserClipDist<3> writes three floats per enabled distance
+ ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
+ }
+
+ MacroTileMgr *pTileMgr = pDC->pTileMgr;
+ for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) // inclusive range: every macrotile the clamped bbox touches
+ {
+ for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(x, y, &work); // work is a stack local; presumably enqueue copies it -- confirm against MacroTileMgr
+ }
+ }
+ }
+
+ triMask &= ~(1 << triIndex); // done with this triangle; clear its bit so the scan moves on
+ }
+
+endBinTriangles:
+ RDTSC_STOP(FEBinTriangles, 1, 0);
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend. Uses a single-macrotile fast path when CanUseSimplePoints(pDC) holds, otherwise bloats each point by its size.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains point position data for a SIMD's worth of points; only prim[0] is read, and it is modified in place.
+/// @param primID - Primitive ID for each point.
+void BinPoints(
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prim[3],
+ uint32_t primMask, ///< one bit per SIMD lane; lanes whose bit is clear are skipped
+ simdscalari primID)
+{
+ RDTSC_START(FEBinPoints);
+
+ simdvector& primVerts = prim[0];
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_FRONTEND_STATE& feState = state.frontendState;
+ const SWR_GS_STATE& gsState = state.gsState;
+ const SWR_RASTSTATE& rastState = state.rastState;
+
+ if (!feState.vpTransformDisable)
+ {
+ // perspective divide
+ simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
+ primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
+ primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
+ primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
+
+ // viewport transform to screen coords
+ viewportTransform<1>(&primVerts, state.vpMatrix[0]); // only viewport matrix 0 is applied here
+ }
+
+ // adjust for pixel center location
+ simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
+ primVerts.x = _simd_add_ps(primVerts.x, offset);
+ primVerts.y = _simd_add_ps(primVerts.y, offset);
+
+ // convert to fixed point
+ simdscalari vXi, vYi;
+ vXi = fpToFixedPointVertical(primVerts.x);
+ vYi = fpToFixedPointVertical(primVerts.y);
+
+ if (CanUseSimplePoints(pDC))
+ {
+ // adjust for top-left rule
+ vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
+ vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
+
+ // cull points off the top-left edge of the viewport
+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); // movemask gathers sign bits, so lanes with negative x are dropped
+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); // likewise for negative y
+
+ // compute macro tile coordinates
+ simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aMacroX, macroX);
+ _simd_store_si((simdscalari*)aMacroY, macroY);
+
+ // compute raster tile coordinates
+ simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+ simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+
+ // compute raster tile relative x,y for coverage mask
+ simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); // pixel coordinate of the containing raster tile's origin
+ simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
+
+ simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); // integer pixel coordinate minus tile origin
+ simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
+
+ OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
+ _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
+
+ OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
+ _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
+
+ OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
+ _simd_store_ps((float*)aZ, primVerts.z);
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai;
+ pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai.x); // x component reinterpreted as integer bits
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ }
+
+ uint32_t *pPrimID = (uint32_t *)&primID; // view the SIMD register as an array of 32-bit lanes
+ DWORD primIndex = 0;
+ // scan remaining valid points and bin each separately
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+
+ uint32_t numScalarAttribs = linkageCount * 4; // four floats per linked attribute
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri; // points reuse the triangle work descriptor
+
+ // points are always front facing
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+
+ work.pfnWork = RasterizeSimplePoint;
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store attributes
+ float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); // sized for three vertices although ProcessAttributes<1> is used -- presumably the backend expects the triangle layout; confirm
+ desc.pAttribs = pAttribs;
+ desc.numAttribs = linkageCount;
+
+ ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);
+
+ // store raster tile aligned x, y, perspective correct z
+ float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); // four floats allocated, three written
+ desc.pTriBuffer = pTriBuffer;
+ *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; // stored as raw integer bits in the float buffer; the pointer still advances by one float
+ *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
+ *pTriBuffer = aZ[primIndex];
+
+ uint32_t tX = aTileRelativeX[primIndex];
+ uint32_t tY = aTileRelativeY[primIndex];
+
+ // pack the relative x,y into the coverageMask, the rasterizer will
+ // generate the true coverage mask from it
+ work.desc.tri.triFlags.coverageMask = tX | (tY << 4); // tX occupies the low 4 bits, so this assumes tX < 16 -- confirm against the raster tile width
+
+ // bin it
+ MacroTileMgr *pTileMgr = pDC->pTileMgr;
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); // a simple point lands in exactly one macrotile
+ }
+ primMask &= ~(1 << primIndex); // done with this point; clear its bit so the scan moves on
+ }
+ }
+ else
+ {
+ // non simple points need to be potentially binned to multiple macro tiles
+ simdscalar vPointSize;
+ if (rastState.pointParam) // per-point size comes from the vertex data
+ {
+ simdvector size[3];
+ pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
+ vPointSize = size[0].x;
+ }
+ else
+ {
+ vPointSize = _simd_set1_ps(rastState.pointSize); // one size for all points, taken from rasterizer state
+ }
+
+ // bloat point to bbox
+ simdBBox bbox;
+ bbox.left = bbox.right = vXi;
+ bbox.top = bbox.bottom = vYi;
+
+ simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
+ simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
+ bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
+ bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
+ bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
+ bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+
+ // Cull bloated points completely outside scissor
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); // left > right after clamping means an empty intersection
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ primMask = primMask & ~maskOutsideScissor; // no early-out here; an empty mask just means the loop below does not run
+
+ // Convert bbox to macrotile units.
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; // spilled to memory so the scalar loop below can index per point
+ _simd_store_si((simdscalari*)aMTLeft, bbox.left);
+ _simd_store_si((simdscalari*)aMTRight, bbox.right);
+ _simd_store_si((simdscalari*)aMTTop, bbox.top);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai[2];
+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x); // x component of vertex 0, reinterpreted as integer bits
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ }
+
+ OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
+ _simd_store_ps((float*)aPointSize, vPointSize);
+
+ uint32_t *pPrimID = (uint32_t *)&primID; // view the SIMD register as an array of 32-bit lanes
+
+ OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
+
+ _simd_store_ps((float*)aPrimVertsX, primVerts.x);
+ _simd_store_ps((float*)aPrimVertsY, primVerts.y);
+ _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
+
+ // scan remaining valid prims and bin each separately
+ DWORD primIndex;
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+ uint32_t numScalarAttribs = linkageCount * 4; // four floats per linked attribute
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri; // points reuse the triangle work descriptor
+
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.pointSize = aPointSize[primIndex];
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+
+ work.pfnWork = RasterizeTriPoint;
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); // sized for three vertices although ProcessAttributes<1> is used
+ desc.numAttribs = linkageCount;
+ ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
+
+ // store point vertex data
+ float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); // four floats allocated, three written
+ desc.pTriBuffer = pTriBuffer;
+ *pTriBuffer++ = aPrimVertsX[primIndex];
+ *pTriBuffer++ = aPrimVertsY[primIndex];
+ *pTriBuffer = aPrimVertsZ[primIndex];
+
+ // store user clip distances
+ if (rastState.clipDistanceMask)
+ {
+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+ desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); // two floats per enabled distance, matching the <2> instantiation below
+ ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); // NOTE(review): instantiated for 2 vertices although a point has 1; the second vertex's data may be unwritten by AssembleSingle -- confirm
+ }
+
+ MacroTileMgr *pTileMgr = pDC->pTileMgr;
+ for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) // inclusive range: every macrotile the clamped bbox touches
+ {
+ for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(x, y, &work); // work is a stack local; presumably enqueue copies it -- confirm against MacroTileMgr
+ }
+ }
+ }
+
+ primMask &= ~(1 << primIndex); // done with this point; clear its bit so the scan moves on
+ }
+ }
+
+
+
+
+ RDTSC_STOP(FEBinPoints, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD lines to the backend.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains line endpoint position data for a SIMD's worth of lines; prim[0] and prim[1] are modified in place.
+/// @param primID - Primitive ID for each line.
+void BinLines(
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prim[],
+ uint32_t primMask, ///< one bit per SIMD lane; lanes whose bit is clear are skipped
+ simdscalari primID)
+{
+ RDTSC_START(FEBinLines);
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_RASTSTATE& rastState = state.rastState;
+ const SWR_FRONTEND_STATE& feState = state.frontendState;
+ const SWR_GS_STATE& gsState = state.gsState;
+
+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); // 1/w defaults to 1.0; overwritten below only when the viewport transform is enabled
+ simdscalar vRecipW1 = _simd_set1_ps(1.0f);
+
+ if (!feState.vpTransformDisable)
+ {
+ // perspective divide
+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
+
+ prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
+ prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
+
+ prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
+ prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
+
+ prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
+ prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
+
+ // viewport transform to screen coords
+ viewportTransform<2>(prim, state.vpMatrix[0]); // only viewport matrix 0 is applied here
+ }
+
+ // adjust for pixel center location
+ simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
+ prim[0].x = _simd_add_ps(prim[0].x, offset);
+ prim[0].y = _simd_add_ps(prim[0].y, offset);
+
+ prim[1].x = _simd_add_ps(prim[1].x, offset);
+ prim[1].y = _simd_add_ps(prim[1].y, offset);
+
+ // convert to fixed point
+ simdscalari vXi[2], vYi[2];
+ vXi[0] = fpToFixedPointVertical(prim[0].x);
+ vYi[0] = fpToFixedPointVertical(prim[0].y);
+ vXi[1] = fpToFixedPointVertical(prim[1].x);
+ vYi[1] = fpToFixedPointVertical(prim[1].y);
+
+ // compute x-major vs y-major mask
+ simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
+ simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
+ simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); // strictly greater, so lines with equal extents count as x-major
+ uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
+
+ // cull zero-length lines
+ simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
+ vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); // zero length means both extents are zero in fixed point
+
+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
+
+ uint32_t *pPrimID = (uint32_t *)&primID; // view the SIMD register as an array of 32-bit lanes
+
+ simdscalar vUnused = _simd_setzero_ps(); // zero filler for the third-vertex argument of vTranspose3x8 below
+
+ // Calc bounding box of lines
+ simdBBox bbox;
+ bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
+ bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
+ bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
+ bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
+
+ // bloat bbox by line width along minor axis
+ simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
+ simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
+ simdBBox bloatBox; // bbox grown by half the line width on all four sides; the blends below keep only the minor-axis growth
+ bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
+ bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
+ bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
+ bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
+
+ bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); // assuming blendv picks its second operand where the mask is set: y-major lines take the widened left/right
+ bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
+ bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); // operands swapped: x-major lines take the widened top/bottom
+ bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+
+ // Cull prims completely outside scissor
+ {
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); // left > right after clamping means an empty intersection
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ primMask = primMask & ~maskOutsideScissor;
+ }
+
+ if (!primMask) // every line was culled; skip straight to the profiling stop
+ {
+ goto endBinLines;
+ }
+
+ // Convert line bbox to macrotile units.
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; // spilled to memory so the scalar loop below can index per line
+ _simd_store_si((simdscalari*)aMTLeft, bbox.left);
+ _simd_store_si((simdscalari*)aMTRight, bbox.right);
+ _simd_store_si((simdscalari*)aMTTop, bbox.top);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+
+ // transpose verts needed for backend
+ /// @todo modify BE to take non-transformed verts
+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; // hard-coded to 8 entries; indexed by primIndex, so this assumes KNOB_SIMD_WIDTH <= 8
+ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); // presumably yields one __m128 per line holding that component for both endpoints -- confirm
+ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
+ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); // note: this row carries 1/w, not w
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai[2];
+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x); // x component of vertex 0, reinterpreted as integer bits
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); // index 0 for every line otherwise
+ }
+
+ // scan remaining valid prims and bin each separately
+ DWORD primIndex;
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+ uint32_t numScalarAttribs = linkageCount * 4; // four floats per linked attribute
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri; // lines reuse the triangle work descriptor
+
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+
+ work.pfnWork = RasterizeLine;
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); // sized for three vertices although ProcessAttributes<2> is used
+ desc.numAttribs = linkageCount;
+ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
+
+ // store line vertex data
+ desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); // four rows of four floats: x, y, z, 1/w
+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+
+ // store user clip distances
+ if (rastState.clipDistanceMask)
+ {
+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+ desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); // ProcessUserClipDist<2> writes two floats per enabled distance
+ ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
+ }
+
+ MacroTileMgr *pTileMgr = pDC->pTileMgr;
+ for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) // inclusive range: every macrotile the clamped bbox touches
+ {
+ for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(x, y, &work); // work is a stack local; presumably enqueue copies it -- confirm against MacroTileMgr
+ }
+ }
+ }
+
+ primMask &= ~(1 << primIndex); // done with this line; clear its bit so the scan moves on
+ }
+
+endBinLines:
+
+ RDTSC_STOP(FEBinLines, 1, 0);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
new file mode 100644
index 00000000000..acb935fc251
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -0,0 +1,327 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file frontend.h
+*
+* @brief Definitions for Frontend which handles vertex processing,
+* primitive assembly, clipping, binning, etc.
+*
+******************************************************************************/
+#pragma once
+#include "context.h"
+
+// Converts four packed floats to fixed point: every lane is multiplied by
+// FIXED_POINT_SCALE and then converted to a 32-bit integer with
+// _mm_cvtps_epi32.
+INLINE
+__m128i fpToFixedPoint(const __m128 vIn)
+{
+    const __m128 vScale = _mm_set1_ps(FIXED_POINT_SCALE);
+    return _mm_cvtps_epi32(_mm_mul_ps(vIn, vScale));
+}
+
+// SIMD-width counterpart of fpToFixedPoint: scales every lane by
+// FIXED_POINT_SCALE and converts it to a 32-bit integer.
+INLINE
+simdscalari fpToFixedPointVertical(const simdscalar vIn)
+{
+    const simdscalar vScale = _simd_set1_ps(FIXED_POINT_SCALE);
+    return _simd_cvtps_epi32(_simd_mul_ps(vIn, vScale));
+}
+
+
+// Computes the A and B edge-equation coefficients for the three edges of a
+// triangle whose vertices are packed one per lane (lane 3 is don't-care).
+//
+// Each edge is the 2D line Ax + By + C = 0 through two consecutive vertices:
+//   A = y0 - y1
+//   B = x1 - x0
+//   C = x0y1 - x1y0   (not computed here)
+//
+// vX, vY : vertex coordinates laid out as [v0, v1, v2, dc]
+// vA     : out, [y0 - y1, y1 - y2, y2 - y0, dc]
+// vB     : out, [x1 - x0, x2 - x1, x0 - x2, dc]
+INLINE
+void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
+{
+    // rotate the lanes to [1, 2, 0, dc] so each vertex lines up with its successor
+    const __m128 vNextY = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
+    const __m128 vNextX = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
+
+    // A = current y - next y
+    vA = _mm_sub_ps(vY, vNextY);
+
+    // B = next x - current x
+    vB = _mm_sub_ps(vNextX, vX);
+}
+
+// Vertical (one triangle per SIMD lane) form of triangleSetupAB.
+// vX[i] / vY[i] hold coordinate i of every triangle; on return
+//   vA[e] = y(e) - y(e+1)
+//   vB[e] = x(e+1) - x(e)
+// with the vertex index wrapping from 2 back to 0.
+INLINE
+void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
+{
+    // all A coefficients are written before any B coefficient, as before
+    for (uint32_t edge = 0; edge < 3; ++edge)
+    {
+        const uint32_t next = (edge + 1) % 3;
+        vA[edge] = _simd_sub_ps(vY[edge], vY[next]);
+    }
+
+    for (uint32_t edge = 0; edge < 3; ++edge)
+    {
+        const uint32_t next = (edge + 1) % 3;
+        vB[edge] = _simd_sub_ps(vX[next], vX[edge]);
+    }
+}
+
+// Integer variant of triangleSetupAB. Vertices are packed one per 32-bit
+// lane as [v0, v1, v2, dc]; the edge coefficients produced are
+//   A = y0 - y1
+//   B = x1 - x0
+// (C = x0y1 - x1y0 is not computed here).
+INLINE
+void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
+{
+    // rotate lanes to [1, 2, 0, 3] so every vertex is paired with its successor
+    const __m128i vNextY = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
+    vA = _mm_sub_epi32(vY, vNextY);
+
+    const __m128i vNextX = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
+    vB = _mm_sub_epi32(vNextX, vX);
+}
+
+// Vertical integer form of the edge setup: one triangle per SIMD lane.
+//   vA[e] = y(e) - y(e+1)
+//   vB[e] = x(e+1) - x(e)
+// with the vertex index wrapping from 2 back to 0.
+INLINE
+void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
+{
+    // all A coefficients are written before any B coefficient, as before
+    for (uint32_t edge = 0; edge < 3; ++edge)
+    {
+        const uint32_t next = (edge + 1) % 3;
+        vA[edge] = _simd_sub_epi32(vY[edge], vY[next]);
+    }
+
+    for (uint32_t edge = 0; edge < 3; ++edge)
+    {
+        const uint32_t next = (edge + 1) % 3;
+        vB[edge] = _simd_sub_epi32(vX[next], vX[edge]);
+    }
+}
+// Calculate the determinant of the triangle
+// 2 vectors between the 3 points: P, Q
+// Px = x0-x2, Py = y0-y2
+// Qx = x1-x2, Qy = y1-y2
+//       |Px Qx|
+// det = |     | = PxQy - PyQx
+//       |Py Qy|
+// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
+// try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
+//               : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
+//               : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
+//               : B[2]*A[1] - A[2]*B[1]
+//
+// vA, vB : integer edge coefficients as produced by triangleSetupABInt
+//          (lanes 0..2 used).
+// Returns the 64-bit integer determinant A1*B2 - A2*B1 multiplied by
+// 1 / FIXED_POINT16_SCALE, narrowed to float.
+INLINE
+float calcDeterminantInt(const __m128i vA, const __m128i vB)
+{
+    // vAShuf = [A1, A0, A2, A0]
+    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
+    // vBShuf = [B2, B0, B1, B0]
+    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
+    // _mm_mul_epi32 multiplies 32-bit lanes 0 and 2 into two 64-bit products
+    // vMul = [A1*B2, B1*A2]
+    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
+
+    // shuffle upper to lower
+    // vMul2 = [B1*A2, B1*A2]
+    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
+    // low 64 bits of vMul = A1*B2 - B1*A2
+    vMul = _mm_sub_epi64(vMul, vMul2);
+
+    // Store only the low 64 bits. _mm_store1_pd would write the value twice
+    // (16 bytes) and overrun this 8-byte variable; _mm_storel_epi64 writes
+    // exactly 8 bytes and has no alignment requirement.
+    int64_t result;
+    _mm_storel_epi64((__m128i*)&result, vMul);
+
+    double fResult = (double)result;
+    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
+
+    return (float)fResult;
+}
+
+// Vertical (one triangle per SIMD lane) determinant: computes A1*B2 - A2*B1
+// for every lane as a 64-bit integer.
+//
+// vA, vB : edge coefficients, indexed by edge; elements [1] and [2] are read.
+// pvDet  : out, must have room for two simdscalari. pvDet[0] receives the
+//          64-bit results for lanes 0..3 and pvDet[1] those for lanes 4..7.
+//
+// Unlike calcDeterminantInt, the results are left as raw integers; no
+// fixed-point scale is applied here.
+// The direct _mm256_* permutes mean this body is written for 256-bit
+// simdscalari.
+INLINE
+void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
+{
+ // refer to calcDeterminantInt comment for calculation explanation
+ // A1*B2
+ // duplicate each 32-bit lane so the even slots read by the 32x32->64
+ // multiply hold every input lane (lane numbers shown on the right)
+ simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
+ simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
+
+ simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
+ simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
+
+ simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
+ simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
+
+ // B1*A2
+ simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
+ simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
+
+ simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
+ simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
+
+ simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
+ simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
+
+ // A1*B2 - A2*B1
+ simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
+ simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
+
+ // shuffle 0 1 4 5 -> 0 1 2 3
+ // 0x20 joins the low 128-bit halves (lanes 0 1 | 2 3),
+ // 0x31 joins the high 128-bit halves (lanes 4 5 | 6 7)
+ simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
+ simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
+
+ pvDet[0] = vResultLo;
+ pvDet[1] = vResultHi;
+}
+
+// Computes the C edge-equation coefficient, C = -Ax - By, for each lane.
+// vX, vY : vertex coordinates
+// vA, vB : A and B coefficients for the same lanes
+// vC     : out, -(vA * vX) - (vB * vY)
+INLINE
+void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
+{
+    // vC is written before vB is read, exactly as in the original sequence
+    vC = _mm_mul_ps(vA, vX);
+    const __m128 vBy = _mm_mul_ps(vB, vY);
+
+    // negate Ax, then subtract By
+    const __m128 vNegOne = _mm_set1_ps(-1.0f);
+    vC = _mm_sub_ps(_mm_mul_ps(vC, vNegOne), vBy);
+}
+
+// Applies the viewport scale and offset to packed x/y/z coordinates in place:
+//   x = x * m00 + m30
+//   y = y * m11 + m31
+//   z = z * m22 + m32
+INLINE
+void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
+{
+    const __m128 vScaledX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
+    vX = _mm_add_ps(vScaledX, _mm_set1_ps(vpMatrix.m30));
+
+    const __m128 vScaledY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
+    vY = _mm_add_ps(vScaledY, _mm_set1_ps(vpMatrix.m31));
+
+    const __m128 vScaledZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
+    vZ = _mm_add_ps(vScaledZ, _mm_set1_ps(vpMatrix.m32));
+}
+
+// Applies the viewport scale and offset, in place, to the x/y/z components of
+// the first NumVerts entries of v, using one fused multiply-add per component.
+template<uint32_t NumVerts>
+INLINE
+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix)
+{
+    // broadcast each matrix term once, outside the per-vertex loop
+    const simdscalar vScaleX  = _simd_load1_ps(&vpMatrix.m00);
+    const simdscalar vOffsetX = _simd_load1_ps(&vpMatrix.m30);
+    const simdscalar vScaleY  = _simd_load1_ps(&vpMatrix.m11);
+    const simdscalar vOffsetY = _simd_load1_ps(&vpMatrix.m31);
+    const simdscalar vScaleZ  = _simd_load1_ps(&vpMatrix.m22);
+    const simdscalar vOffsetZ = _simd_load1_ps(&vpMatrix.m32);
+
+    for (uint32_t vert = 0; vert < NumVerts; ++vert)
+    {
+        simdvector &cur = v[vert];
+        cur.x = _simd_fmadd_ps(cur.x, vScaleX, vOffsetX);
+        cur.y = _simd_fmadd_ps(cur.y, vScaleY, vOffsetY);
+        cur.z = _simd_fmadd_ps(cur.z, vScaleZ, vOffsetZ);
+    }
+}
+
+// Computes the integer bounding box of one triangle.
+// vX, vY : 32-bit signed vertex coordinates in lanes 0..2 (lane 3 never
+//          reaches lane 0 of the results, so it does not affect the box).
+// bbox   : out; left/right receive the min/max of the three x values and
+//          top/bottom the min/max of the three y values.
+INLINE
+void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox)
+{
+ // Horizontal integer min/max across lanes 0..2: the two shuffles place
+ // vertex 1 and vertex 2 in lane 0, so lane 0 of each min/max chain below
+ // sees all three vertices.
+ __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
+ __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
+
+ __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
+ __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
+
+
+ __m128i vMinX = _mm_min_epi32(vX, vX1);
+ vMinX = _mm_min_epi32(vMinX, vX2);
+
+ __m128i vMaxX = _mm_max_epi32(vX, vX1);
+ vMaxX = _mm_max_epi32(vMaxX, vX2);
+
+ __m128i vMinY = _mm_min_epi32(vY, vY1);
+ vMinY = _mm_min_epi32(vMinY, vY2);
+
+ __m128i vMaxY = _mm_max_epi32(vY, vY1);
+ vMaxY = _mm_max_epi32(vMaxY, vY2);
+
+ // only lane 0 holds the full three-vertex result
+ bbox.left = _mm_extract_epi32(vMinX, 0);
+ bbox.right = _mm_extract_epi32(vMaxX, 0);
+ bbox.top = _mm_extract_epi32(vMinY, 0);
+ bbox.bottom = _mm_extract_epi32(vMaxY, 0);
+
+ // NOTE(review): the disabled block below is not compiled; it looks like a
+ // leftover design note for packing the four results with shuffles.
+#if 0
+ Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0)
+B = _mm_shuffle_ps(Z, W, 0 0 0 0)
+A = _mm_shuffle_epi32(A, 3 0 3 0)
+A = _mm_shuffle_ps(A, B, 1 0 1 0)
+#endif
+
+}
+
+// Vertical bounding box: one triangle per SIMD lane.
+// vX[i] / vY[i] hold integer coordinate i of every triangle; each bbox member
+// receives the per-lane signed 32-bit min or max over the three vertices.
+INLINE
+void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox)
+{
+    const simdscalari vLeft   = _simd_min_epi32(_simd_min_epi32(vX[0], vX[1]), vX[2]);
+    const simdscalari vRight  = _simd_max_epi32(_simd_max_epi32(vX[0], vX[1]), vX[2]);
+    const simdscalari vTop    = _simd_min_epi32(_simd_min_epi32(vY[0], vY[1]), vY[2]);
+    const simdscalari vBottom = _simd_max_epi32(_simd_max_epi32(vY[0], vY[1]), vY[2]);
+
+    bbox.left   = vLeft;
+    bbox.right  = vRight;
+    bbox.top    = vTop;
+    bbox.bottom = vBottom;
+}
+
+// Returns true only when the draw's rasterizer state is single-sampled, has a
+// point size of exactly 1.0, and has both pointParam and pointSpriteEnable
+// turned off.
+INLINE
+bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
+{
+    const API_STATE& state = GetApiState(pDC);
+
+    if (!(state.rastState.sampleCount == SWR_MULTISAMPLE_1X))
+    {
+        return false;
+    }
+    if (!(state.rastState.pointSize == 1.0f))
+    {
+        return false;
+    }
+    if (state.rastState.pointParam)
+    {
+        return false;
+    }
+    return !state.rastState.pointSpriteEnable;
+}
+
+// Primitive counting helpers (defined outside this header).
+// NOTE(review): presumably the primitive count for numElements vertices of the
+// given topology, and the vertices consumed per primitive — confirm against
+// the definitions.
+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
+uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
+
+// Templated Draw front-end function. All combinations of template parameter values are available
+template <bool IsIndexedT, bool HasTessellationT, bool HasGeometryShaderT, bool HasStreamOutT, bool HasRastT>
+void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+
+// Non-draw front-end entry points; all share the ProcessDraw signature.
+void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+
+// NOTE(review): the Bin* prototypes below take PA_STATE&, not PA_STATE_BASE;
+// confirm PA_STATE is declared through context.h and whether this forward
+// declaration is still needed.
+struct PA_STATE_BASE; // forward decl
+// Binning entry points for triangles, points and lines. primMask selects the
+// active SIMD lanes; primID carries one primitive id per lane.
+void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
new file mode 100644
index 00000000000..d7feb86273d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -0,0 +1,142 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file knobs.h
+*
+* @brief Static (Compile-Time) Knobs for Core.
+*
+******************************************************************************/
+#pragma once
+
+#include <stdint.h>
+#include <gen_knobs.h>
+
+// Values KNOB_ARCH may take to select the target instruction set.
+#define KNOB_ARCH_AVX 0
+#define KNOB_ARCH_AVX2 1
+#define KNOB_ARCH_AVX512 2
+
+///////////////////////////////////////////////////////////////////////////////
+// Architecture validation
+///////////////////////////////////////////////////////////////////////////////
+// Default to AVX when the build does not define KNOB_ARCH.
+#if !defined(KNOB_ARCH)
+#define KNOB_ARCH KNOB_ARCH_AVX
+#endif
+
+// Derive the ISA token, its display string and the SIMD width from KNOB_ARCH.
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+#define KNOB_ARCH_ISA AVX
+#define KNOB_ARCH_STR "AVX"
+#define KNOB_SIMD_WIDTH 8
+#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
+#define KNOB_ARCH_ISA AVX2
+#define KNOB_ARCH_STR "AVX2"
+#define KNOB_SIMD_WIDTH 8
+#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
+// The AVX512 values are spelled out, but the build is rejected by the #error.
+#define KNOB_ARCH_ISA AVX512F
+#define KNOB_ARCH_STR "AVX512"
+#define KNOB_SIMD_WIDTH 16
+#error "AVX512 not yet supported"
+#else
+#error "Unknown architecture"
+#endif
+
+// Size in bytes (terminating NUL included, via sizeof) of a buffer large
+// enough for any KNOB_ARCH_STR defined above.
+#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
+
+///////////////////////////////////////////////////////////////////////////////
+// Configuration knobs
+///////////////////////////////////////////////////////////////////////////////
+#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon.
+
+// Maximum supported number of active vertex buffer streams
+#define KNOB_NUM_STREAMS 32
+
+// Maximum supported number of attributes per vertex
+#define KNOB_NUM_ATTRIBUTES 38
+
+// Maximum supported active viewports and scissors
+#define KNOB_NUM_VIEWPORTS_SCISSORS 16
+
+// Guardband range used by the clipper
+#define KNOB_GUARDBAND_WIDTH 32768.0f
+#define KNOB_GUARDBAND_HEIGHT 32768.0f
+
+///////////////////////////////
+// Macro tile configuration
+///////////////////////////////
+
+// raster tile dimensions
+// (each *_SHIFT is log2 of the matching dimension and must be kept in sync)
+#define KNOB_TILE_X_DIM 8
+#define KNOB_TILE_X_DIM_SHIFT 3
+#define KNOB_TILE_Y_DIM 8
+#define KNOB_TILE_Y_DIM_SHIFT 3
+
+// fixed macrotile pixel dimension for now, eventually will be
+// dynamically set based on tile format and pixel size
+// *_DIM_FIXED is the dimension shifted left by 8 (presumably 8 fractional
+// fixed-point bits — confirm); *_FIXED_SHIFT is log2 of that value
+// (64 << 8 == 1 << 14) and must be updated together with the dimension.
+#define KNOB_MACROTILE_X_DIM 64
+#define KNOB_MACROTILE_Y_DIM 64
+#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
+#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
+#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14
+#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14
+// macrotile size measured in raster tiles
+#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
+#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
+
+// total # of hot tiles available. This should be enough to
+// fully render a 16kx16k 128bpp render target
+#define KNOB_NUM_HOT_TILES_X 256
+#define KNOB_NUM_HOT_TILES_Y 256
+// surface formats used for the color, depth and stencil hot tiles
+#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
+#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
+#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
+
+// Max scissor rectangle: number of hot tiles times the macrotile pixel
+// dimension along each axis. The products are parenthesized so the macro
+// expands as a single operand inside larger expressions (division, casts,
+// subtraction, ...).
+#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM)
+#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM)
+
+// An 8-wide SIMD covers a 4x2 pixel footprint (see SIMD_TILE_*_DIM below), so
+// the raster tile must be at least 4 pixels wide.
+#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4
+#error "incompatible width/tile dimensions"
+#endif
+
+// Pixel footprint handled by one SIMD register; only width 8 is defined.
+#if KNOB_SIMD_WIDTH == 8
+#define SIMD_TILE_X_DIM 4
+#define SIMD_TILE_Y_DIM 2
+#else
+#error "Invalid simd width"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// Optimization knobs
+///////////////////////////////////////////////////////////////////////////////
+#define KNOB_USE_FAST_SRGB TRUE
+
+// enables cut-aware primitive assembler
+#define KNOB_ENABLE_CUT_AWARE_PA TRUE
+
+///////////////////////////////////////////////////////////////////////////////
+// Debug knobs
+///////////////////////////////////////////////////////////////////////////////
+// Left undefined by default; uncomment to define KNOB_ENABLE_RDTSC.
+//#define KNOB_ENABLE_RDTSC
+
+// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
+// Defaults to 0 unless the build defines it first.
+#if !defined(KNOB_ENABLE_TOSS_POINTS)
+#define KNOB_ENABLE_TOSS_POINTS 0
+#endif
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
new file mode 100644
index 00000000000..3f19555557f
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -0,0 +1,98 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file knobs_init.h
+*
+* @brief Dynamic Knobs Initialization for Core.
+*
+******************************************************************************/
+#pragma once
+
+#include <core/knobs.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+
+// Parses an unsigned decimal override string into a knob value.
+// Assume the type is compatible with a 32-bit integer
+//
+// pOverride : NUL-terminated text of the override (must not be null)
+// knobValue : in/out; overwritten only when a number was actually parsed
+template <typename T>
+static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
+{
+    uint32_t value = 0;
+    // sscanf returns EOF (a non-zero value) for empty or all-whitespace
+    // input, so require exactly one successful conversion instead of testing
+    // for "non-zero"; otherwise an empty variable would reset the knob to 0.
+    if (sscanf(pOverride, "%u", &value) == 1)
+    {
+        knobValue = static_cast<T>(value);
+    }
+}
+
+// Parses a boolean override string.
+// A single character y/t/1 (any case) sets true and n/f/0 sets false;
+// anything else is parsed as an unsigned number, where non-zero means true.
+//
+// pOverride : NUL-terminated text of the override (must not be null)
+// knobValue : in/out; left untouched when the text cannot be interpreted
+static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
+{
+    size_t len = strlen(pOverride);
+    if (len == 1)
+    {
+        // tolower() is only defined for values representable as unsigned
+        // char (or EOF), so convert before calling it.
+        auto c = tolower(static_cast<unsigned char>(pOverride[0]));
+        if (c == 'y' || c == 't' || c == '1')
+        {
+            knobValue = true;
+            return;
+        }
+        if (c == 'n' || c == 'f' || c == '0')
+        {
+            knobValue = false;
+            return;
+        }
+    }
+
+    // Try converting to a number and casting to bool.
+    // sscanf returns EOF (non-zero) on empty input; require one conversion so
+    // an empty variable does not silently force the knob to false.
+    uint32_t value = 0;
+    if (sscanf(pOverride, "%u", &value) == 1)
+    {
+        knobValue = value != 0;
+        return;
+    }
+}
+
+// Parses a floating-point override string into a knob value.
+// The scratch value starts out as the current knob value, so the knob only
+// changes when sscanf actually converts a number.
+static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
+{
+    float parsed = knobValue;
+    const int scanResult = sscanf(pOverride, "%f", &parsed);
+    if (scanResult != 0)
+    {
+        knobValue = parsed;
+    }
+}
+
+// Initializes one knob from the environment: looks up the variable named by
+// knob.Name() and, when it is set, converts its text with ConvertEnvToKnob
+// and writes the result back through knob.Value(...).
+template <typename T>
+static inline void InitKnob(T& knob)
+{
+    // TODO, read registry first
+
+    // Second, read environment variables
+    const char* pEnvText = getenv(knob.Name());
+    if (pEnvText == nullptr)
+    {
+        // no override present; keep the knob as it is
+        return;
+    }
+
+    auto parsed = knob.Value();
+    ConvertEnvToKnob(pEnvText, parsed);
+    knob.Value(parsed);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
new file mode 100644
index 00000000000..d51a546b063
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
@@ -0,0 +1,51 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file multisample.cpp
+*
+******************************************************************************/
+
+#include "multisample.h"
+
+// Out-of-class definitions of the per-sample-count position tables declared
+// in multisample.h.
+//
+// samplePosXi / samplePosYi: sample positions as 0.8 fixed-point offsets
+// inside a pixel (0x100 would be one full pixel, so 0x80 is the centre).
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2] {0xC0, 0x40};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2] {0xC0, 0x40};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4] {0x60, 0xE0, 0x20, 0xA0};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4] {0x20, 0x60, 0xA0, 0xE0};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8] {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8] {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16]
+{0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10};
+const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16]
+{0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00};
+
+// samplePosX / samplePosY: the same positions as floats in [0, 1); each entry
+// equals the matching fixed-point entry above divided by 256. Keep the two
+// sets of tables in sync when editing.
+const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX{0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY{0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2]{0.75f, 0.25f};
+const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2]{0.75f, 0.25f};
+const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4]{0.375f, 0.875, 0.125, 0.625};
+const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4]{0.125, 0.375, 0.625, 0.875};
+const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8]{0.5625, 0.4375, 0.8125, 0.3125, 0.1875, 0.0625, 0.6875, 0.9375};
+const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8]{0.3125, 0.6875, 0.5625, 0.1875, 0.8125, 0.4375, 0.9375, 0.0625};
+const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16]
+{0.5625, 0.4375, 0.3125, 0.7500, 0.1875, 0.6250, 0.8125, 0.6875, 0.3750, 0.5000, 0.2500, 0.1250, 0.0000, 0.9375, 0.8750, 0.0625};
+const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16]
+{0.5625, 0.3125, 0.6250, 0.4375, 0.3750, 0.8125, 0.6875, 0.1875, 0.8750, 0.0625, 0.1250, 0.7500, 0.5000, 0.2500, 0.9375, 0.0000};
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
new file mode 100644
index 00000000000..4ae777e2fc5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
@@ -0,0 +1,620 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file multisample.h
+*
+******************************************************************************/
+
+#pragma once
+
+#include "context.h"
+#include "format_traits.h"
+
+// Maps a SWR_MULTISAMPLE_COUNT enumerant to the number of samples it stands
+// for, using the table {1, 2, 4, 8, 16}. Asserts that the enumerant is below
+// SWR_MULTISAMPLE_TYPE_MAX.
+INLINE
+uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount)
+{
+    assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX);
+
+    static const uint32_t samplesPerCount[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16};
+    return samplesPerCount[sampleCount];
+}
+
+// Inverse of GetNumSamples: maps a sample count of 1, 2, 4, 8 or 16 to its
+// SWR_MULTISAMPLE_COUNT enumerant. Any other count asserts and falls back to
+// SWR_MULTISAMPLE_1X.
+INLINE
+SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
+{
+    if (numSamples == 1)  { return SWR_MULTISAMPLE_1X; }
+    if (numSamples == 2)  { return SWR_MULTISAMPLE_2X; }
+    if (numSamples == 4)  { return SWR_MULTISAMPLE_4X; }
+    if (numSamples == 8)  { return SWR_MULTISAMPLE_8X; }
+    if (numSamples == 16) { return SWR_MULTISAMPLE_16X; }
+
+    // unsupported sample count
+    assert(0);
+    return SWR_MULTISAMPLE_1X;
+}
+
+// hardcoded offsets based on Direct3d standard multisample positions
+// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
+// NOTE(review): "8 x 8" disagrees with the stated (0, 0)..(15, 15) range; the
+// tables in multisample.cpp use steps of 0x10 out of 0x100, which looks like
+// a 16 x 16 sub-pixel grid — confirm and correct the line above.
+// coords are 0.8 fixed point offsets from (0, 0)
+//
+// Primary template: every accessor is deleted, so only the explicit
+// specializations for concrete sample counts can be used.
+template<SWR_MULTISAMPLE_COUNT sampleCount>
+struct MultisampleTraits
+{
+ // sample position broadcast to all lanes, 0.8 fixed point
+ INLINE static __m128i vXi(uint32_t sampleNum) = delete;
+ INLINE static __m128i vYi(uint32_t sampleNum) = delete;
+ // sample position broadcast to all lanes, float
+ INLINE static simdscalar vX(uint32_t sampleNum) = delete;
+ INLINE static simdscalar vY(uint32_t sampleNum) = delete;
+ // scalar float sample position
+ INLINE static float X(uint32_t sampleNum) = delete;
+ INLINE static float Y(uint32_t sampleNum) = delete;
+ // per-corner offsets derived from the sample bounding box
+ INLINE static __m128i TileSampleOffsetsX() = delete;
+ INLINE static __m128i TileSampleOffsetsY() = delete;
+ // byte offset of a sample's data within a raster tile
+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete;
+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete;
+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete;
+ // mask with one bit set per sample, broadcast to all lanes
+ INLINE static simdscalari FullSampleMask() = delete;
+
+ static const uint32_t numSamples = 0;
+};
+
+// Single-sample specialization: the one sample sits at the pixel centre
+// (0x80 in 0.8 fixed point, 0.5 as float). The sampleNum arguments are
+// accepted for interface parity with the other specializations and ignored.
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_1X>
+{
+ // fixed-point x position broadcast to all four lanes
+ INLINE static __m128i vXi(uint32_t sampleNum)
+ {
+ static const __m128i X = _mm_set1_epi32(samplePosXi);
+ return X;
+ }
+
+ // fixed-point y position broadcast to all four lanes
+ INLINE static __m128i vYi(uint32_t sampleNum)
+ {
+ static const __m128i Y = _mm_set1_epi32(samplePosYi);
+ return Y;
+ }
+
+ // float x position (0.5) broadcast across the SIMD register
+ INLINE static simdscalar vX(uint32_t sampleNum)
+ {
+ static const simdscalar X = _simd_set1_ps(0.5f);
+ return X;
+ }
+
+ // float y position (0.5) broadcast across the SIMD register
+ INLINE static simdscalar vY(uint32_t sampleNum)
+ {
+ static const simdscalar Y = _simd_set1_ps(0.5f);
+ return Y;
+ }
+
+ // scalar float positions; the values are defined in multisample.cpp
+ INLINE static float X(uint32_t sampleNum) {return samplePosX;};
+ INLINE static float Y(uint32_t sampleNum) {return samplePosY;};
+
+ // x offsets for the four corners; with a single sample the left and
+ // right edges of the sample bounding box coincide at 0x80
+ INLINE static __m128i TileSampleOffsetsX()
+ {
+ static const uint32_t bboxLeftEdge = 0x80;
+ static const uint32_t bboxRightEdge = 0x80;
+ // BR, BL, UR, UL
+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
+ return tileSampleOffsetX;
+ }
+
+ // y offsets for the four corners; top and bottom edges coincide at 0x80
+ INLINE static __m128i TileSampleOffsetsY()
+ {
+ static const uint32_t bboxTopEdge = 0x80;
+ static const uint32_t bboxBottomEdge = 0x80;
+ // BR, BL, UR, UL
+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
+ return tileSampleOffsetY;
+ }
+
+ // with one sample there is only one plane per tile, so every offset is 0
+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+ {
+ return 0;
+ }
+
+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+ {
+ return 0;
+ }
+
+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+ {
+ return 0;
+ }
+
+ // one bit per sample: 0x1
+ INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
+
+ // integer positions are initialized in-class; the float positions are
+ // defined out of class in multisample.cpp
+ static const uint32_t samplePosXi {0x80};
+ static const uint32_t samplePosYi {0x80};
+ static const float samplePosX;
+ static const float samplePosY;
+ static const uint32_t numSamples = 1;
+};
+
+// Two-sample specialization. The position tables (samplePosXi/Yi and
+// samplePosX/Y) are defined in multisample.cpp; every accessor that takes a
+// sample index asserts sampleNum < numSamples.
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_2X>
+{
+ // fixed-point x position of the given sample, broadcast to four lanes
+ INLINE static __m128i vXi(uint32_t sampleNum)
+ {
+ SWR_ASSERT(sampleNum < numSamples);
+ static const __m128i X[numSamples] {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1])};
+ return X[sampleNum];
+ }
+
+ // fixed-point y position of the given sample, broadcast to four lanes
+ INLINE static __m128i vYi(uint32_t sampleNum)
+ {
+ SWR_ASSERT(sampleNum < numSamples);
+ static const __m128i Y[numSamples] {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1])};
+ return Y[sampleNum];
+ }
+
+ // float x position of the given sample, broadcast across the SIMD register
+ // (literals duplicate the samplePosX table; keep them in sync)
+ INLINE static simdscalar vX(uint32_t sampleNum)
+ {
+ static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)};
+ assert(sampleNum < numSamples);
+ return X[sampleNum];
+ }
+
+ // float y position of the given sample, broadcast across the SIMD register
+ // (literals duplicate the samplePosY table; keep them in sync)
+ INLINE static simdscalar vY(uint32_t sampleNum)
+ {
+ static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)};
+ assert(sampleNum < numSamples);
+ return Y[sampleNum];
+ }
+
+ // scalar float positions read from the tables
+ INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
+ INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
+
+ // x offsets for the four corners, taken from the left (0x40) and right
+ // (0xC0) edges of the sample bounding box, i.e. the min/max of samplePosXi
+ INLINE static __m128i TileSampleOffsetsX()
+ {
+ static const uint32_t bboxLeftEdge = 0x40;
+ static const uint32_t bboxRightEdge = 0xC0;
+ // BR, BL, UR, UL
+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
+ return tileSampleOffsetX;
+ }
+
+ // y offsets for the four corners, taken from the top (0x40) and bottom
+ // (0xC0) edges of the sample bounding box, i.e. the min/max of samplePosYi
+ INLINE static __m128i TileSampleOffsetsY()
+ {
+ static const uint32_t bboxTopEdge = 0x40;
+ static const uint32_t bboxBottomEdge = 0xC0;
+ // BR, BL, UR, UL
+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
+ return tileSampleOffsetY;
+ }
+
+ // byte offset of the given sample's color data: sample 1 starts one full
+ // raster tile (tile width * tile height * bytes per pixel) after sample 0
+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileColorOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileColorOffsets[sampleNum];
+ }
+
+ // same layout as the color offsets, sized with the depth hot-tile format
+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileDepthOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileDepthOffsets[sampleNum];
+ }
+
+ // same layout as the color offsets, sized with the stencil hot-tile format
+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileStencilOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileStencilOffsets[sampleNum];
+ }
+
+ // one bit per sample: 0x3
+ INLINE static simdscalari FullSampleMask()
+ {
+ static const simdscalari mask =_simd_set1_epi32(0x3);
+ return mask;
+ }
+
+ // position tables; defined out of class in multisample.cpp
+ static const uint32_t samplePosXi[2];
+ static const uint32_t samplePosYi[2];
+ static const float samplePosX[2];
+ static const float samplePosY[2];
+ static const uint32_t numSamples = 2;
+};
+
+// Traits for 4x multisampling: sample positions in several forms, the sample
+// bounding box, per-sample raster tile offsets and the full sample mask.
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_4X>
+{
+    // samplePosXi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        static const __m128i splatXi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosXi[0]),
+            _mm_set1_epi32(samplePosXi[1]),
+            _mm_set1_epi32(samplePosXi[2]),
+            _mm_set1_epi32(samplePosXi[3]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatXi[sampleNum];
+    }
+
+    // samplePosYi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        static const __m128i splatYi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosYi[0]),
+            _mm_set1_epi32(samplePosYi[1]),
+            _mm_set1_epi32(samplePosYi[2]),
+            _mm_set1_epi32(samplePosYi[3]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatYi[sampleNum];
+    }
+
+    // Float X position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        static const simdscalar splatX[numSamples] =
+        {
+            _simd_set1_ps(0.375f),
+            _simd_set1_ps(0.875f),
+            _simd_set1_ps(0.125f),
+            _simd_set1_ps(0.625f),
+        };
+        assert(sampleNum < numSamples);
+        return splatX[sampleNum];
+    }
+
+    // Float Y position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        static const simdscalar splatY[numSamples] =
+        {
+            _simd_set1_ps(0.125f),
+            _simd_set1_ps(0.375f),
+            _simd_set1_ps(0.625f),
+            _simd_set1_ps(0.875f),
+        };
+        assert(sampleNum < numSamples);
+        return splatY[sampleNum];
+    }
+
+    // Scalar X position of one sample, read from samplePosX.
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    }
+
+    // Scalar Y position of one sample, read from samplePosY.
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    }
+
+    // Left/right edge values of the sample bounding box. 0x20 and 0xE0 equal
+    // the smallest and largest X literals in vX() (0.125 and 0.875) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        static const uint32_t leftX = 0x20;
+        static const uint32_t rightX = 0xE0;
+        static const __m128i cornerOffsetsX =
+            _mm_set_epi32(rightX,   // BR
+                          leftX,    // BL
+                          rightX,   // UR
+                          leftX);   // UL
+        return cornerOffsetsX;
+    }
+
+    // Top/bottom edge values of the sample bounding box. 0x20 and 0xE0 equal
+    // the smallest and largest Y literals in vY() (0.125 and 0.875) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        static const uint32_t topY = 0x20;
+        static const uint32_t bottomY = 0xE0;
+        static const __m128i cornerOffsetsY =
+            _mm_set_epi32(bottomY,   // BR
+                          bottomY,   // BL
+                          topY,      // UR
+                          topY);     // UL
+        return cornerOffsetsY;
+    }
+
+    // Offset of sample sampleNum's color data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t colorOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+        };
+        assert(sampleNum < numSamples);
+        return colorOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's depth data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t depthOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+        };
+        assert(sampleNum < numSamples);
+        return depthOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's stencil data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t stencilOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+        };
+        assert(sampleNum < numSamples);
+        return stencilOffsetOfSample[sampleNum];
+    }
+
+    // Mask value 0xF (bits 0-3 set, one per sample) in every 32-bit lane.
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari allSamplesMask = _simd_set1_epi32(0xF);
+        return allSamplesMask;
+    }
+
+    // Position tables are only declared here; their values are defined
+    // outside this struct.
+    static const uint32_t samplePosXi[4];
+    static const uint32_t samplePosYi[4];
+    static const float samplePosX[4];
+    static const float samplePosY[4];
+    static const uint32_t numSamples = 4;
+};
+
+// Traits for 8x multisampling: sample positions in several forms, the sample
+// bounding box, per-sample raster tile offsets and the full sample mask.
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_8X>
+{
+    // samplePosXi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        static const __m128i splatXi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosXi[0]),
+            _mm_set1_epi32(samplePosXi[1]),
+            _mm_set1_epi32(samplePosXi[2]),
+            _mm_set1_epi32(samplePosXi[3]),
+            _mm_set1_epi32(samplePosXi[4]),
+            _mm_set1_epi32(samplePosXi[5]),
+            _mm_set1_epi32(samplePosXi[6]),
+            _mm_set1_epi32(samplePosXi[7]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatXi[sampleNum];
+    }
+
+    // samplePosYi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        static const __m128i splatYi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosYi[0]),
+            _mm_set1_epi32(samplePosYi[1]),
+            _mm_set1_epi32(samplePosYi[2]),
+            _mm_set1_epi32(samplePosYi[3]),
+            _mm_set1_epi32(samplePosYi[4]),
+            _mm_set1_epi32(samplePosYi[5]),
+            _mm_set1_epi32(samplePosYi[6]),
+            _mm_set1_epi32(samplePosYi[7]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatYi[sampleNum];
+    }
+
+    // Float X position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        static const simdscalar splatX[numSamples] =
+        {
+            _simd_set1_ps(0.5625f),
+            _simd_set1_ps(0.4375f),
+            _simd_set1_ps(0.8125f),
+            _simd_set1_ps(0.3125f),
+            _simd_set1_ps(0.1875f),
+            _simd_set1_ps(0.0625f),
+            _simd_set1_ps(0.6875f),
+            _simd_set1_ps(0.9375f),
+        };
+        assert(sampleNum < numSamples);
+        return splatX[sampleNum];
+    }
+
+    // Float Y position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        static const simdscalar splatY[numSamples] =
+        {
+            _simd_set1_ps(0.3125f),
+            _simd_set1_ps(0.6875f),
+            _simd_set1_ps(0.5625f),
+            _simd_set1_ps(0.1875f),
+            _simd_set1_ps(0.8125f),
+            _simd_set1_ps(0.4375f),
+            _simd_set1_ps(0.9375f),
+            _simd_set1_ps(0.0625f),
+        };
+        assert(sampleNum < numSamples);
+        return splatY[sampleNum];
+    }
+
+    // Scalar X position of one sample, read from samplePosX.
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    }
+
+    // Scalar Y position of one sample, read from samplePosY.
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    }
+
+    // Left/right edge values of the sample bounding box. 0x10 and 0xF0 equal
+    // the smallest and largest X literals in vX() (0.0625 and 0.9375) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        static const uint32_t leftX = 0x10;
+        static const uint32_t rightX = 0xF0;
+        static const __m128i cornerOffsetsX =
+            _mm_set_epi32(rightX,   // BR
+                          leftX,    // BL
+                          rightX,   // UR
+                          leftX);   // UL
+        return cornerOffsetsX;
+    }
+
+    // Top/bottom edge values of the sample bounding box. 0x10 and 0xF0 equal
+    // the smallest and largest Y literals in vY() (0.0625 and 0.9375) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        static const uint32_t topY = 0x10;
+        static const uint32_t bottomY = 0xF0;
+        static const __m128i cornerOffsetsY =
+            _mm_set_epi32(bottomY,   // BR
+                          bottomY,   // BL
+                          topY,      // UR
+                          topY);     // UL
+        return cornerOffsetsY;
+    }
+
+    // Offset of sample sampleNum's color data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t colorOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+        };
+        assert(sampleNum < numSamples);
+        return colorOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's depth data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t depthOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+        };
+        assert(sampleNum < numSamples);
+        return depthOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's stencil data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t stencilOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+        };
+        assert(sampleNum < numSamples);
+        return stencilOffsetOfSample[sampleNum];
+    }
+
+    // Mask value 0xFF (bits 0-7 set, one per sample) in every 32-bit lane.
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari allSamplesMask = _simd_set1_epi32(0xFF);
+        return allSamplesMask;
+    }
+
+    // Position tables are only declared here; their values are defined
+    // outside this struct.
+    static const uint32_t samplePosXi[8];
+    static const uint32_t samplePosYi[8];
+    static const float samplePosX[8];
+    static const float samplePosY[8];
+    static const uint32_t numSamples = 8;
+};
+
+// Traits for 16x multisampling: sample positions in several forms, the sample
+// bounding box, per-sample raster tile offsets and the full sample mask.
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_16X>
+{
+    // samplePosXi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        static const __m128i splatXi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosXi[0]),
+            _mm_set1_epi32(samplePosXi[1]),
+            _mm_set1_epi32(samplePosXi[2]),
+            _mm_set1_epi32(samplePosXi[3]),
+            _mm_set1_epi32(samplePosXi[4]),
+            _mm_set1_epi32(samplePosXi[5]),
+            _mm_set1_epi32(samplePosXi[6]),
+            _mm_set1_epi32(samplePosXi[7]),
+            _mm_set1_epi32(samplePosXi[8]),
+            _mm_set1_epi32(samplePosXi[9]),
+            _mm_set1_epi32(samplePosXi[10]),
+            _mm_set1_epi32(samplePosXi[11]),
+            _mm_set1_epi32(samplePosXi[12]),
+            _mm_set1_epi32(samplePosXi[13]),
+            _mm_set1_epi32(samplePosXi[14]),
+            _mm_set1_epi32(samplePosXi[15]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatXi[sampleNum];
+    }
+
+    // samplePosYi[sampleNum] replicated into all four 32-bit lanes.
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        static const __m128i splatYi[numSamples] =
+        {
+            _mm_set1_epi32(samplePosYi[0]),
+            _mm_set1_epi32(samplePosYi[1]),
+            _mm_set1_epi32(samplePosYi[2]),
+            _mm_set1_epi32(samplePosYi[3]),
+            _mm_set1_epi32(samplePosYi[4]),
+            _mm_set1_epi32(samplePosYi[5]),
+            _mm_set1_epi32(samplePosYi[6]),
+            _mm_set1_epi32(samplePosYi[7]),
+            _mm_set1_epi32(samplePosYi[8]),
+            _mm_set1_epi32(samplePosYi[9]),
+            _mm_set1_epi32(samplePosYi[10]),
+            _mm_set1_epi32(samplePosYi[11]),
+            _mm_set1_epi32(samplePosYi[12]),
+            _mm_set1_epi32(samplePosYi[13]),
+            _mm_set1_epi32(samplePosYi[14]),
+            _mm_set1_epi32(samplePosYi[15]),
+        };
+        SWR_ASSERT(sampleNum < numSamples);
+        return splatYi[sampleNum];
+    }
+
+    // Float X position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        static const simdscalar splatX[numSamples] =
+        {
+            _simd_set1_ps(0.5625f),
+            _simd_set1_ps(0.4375f),
+            _simd_set1_ps(0.3125f),
+            _simd_set1_ps(0.7500f),
+            _simd_set1_ps(0.1875f),
+            _simd_set1_ps(0.6250f),
+            _simd_set1_ps(0.8125f),
+            _simd_set1_ps(0.6875f),
+            _simd_set1_ps(0.3750f),
+            _simd_set1_ps(0.5000f),
+            _simd_set1_ps(0.2500f),
+            _simd_set1_ps(0.1250f),
+            _simd_set1_ps(0.0000f),
+            _simd_set1_ps(0.9375f),
+            _simd_set1_ps(0.8750f),
+            _simd_set1_ps(0.0625f),
+        };
+        assert(sampleNum < numSamples);
+        return splatX[sampleNum];
+    }
+
+    // Float Y position of one sample, replicated across a SIMD register.
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        static const simdscalar splatY[numSamples] =
+        {
+            _simd_set1_ps(0.5625f),
+            _simd_set1_ps(0.3125f),
+            _simd_set1_ps(0.6250f),
+            _simd_set1_ps(0.4375f),
+            _simd_set1_ps(0.3750f),
+            _simd_set1_ps(0.8125f),
+            _simd_set1_ps(0.6875f),
+            _simd_set1_ps(0.1875f),
+            _simd_set1_ps(0.8750f),
+            _simd_set1_ps(0.0625f),
+            _simd_set1_ps(0.1250f),
+            _simd_set1_ps(0.7500f),
+            _simd_set1_ps(0.5000f),
+            _simd_set1_ps(0.2500f),
+            _simd_set1_ps(0.9375f),
+            _simd_set1_ps(0.0000f),
+        };
+        assert(sampleNum < numSamples);
+        return splatY[sampleNum];
+    }
+
+    // Scalar X position of one sample, read from samplePosX.
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    }
+
+    // Scalar Y position of one sample, read from samplePosY.
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    }
+
+    // Left/right edge values of the sample bounding box. 0x00 and 0xF0 equal
+    // the smallest and largest X literals in vX() (0.0 and 0.9375) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        static const uint32_t leftX = 0x00;
+        static const uint32_t rightX = 0xF0;
+        static const __m128i cornerOffsetsX =
+            _mm_set_epi32(rightX,   // BR
+                          leftX,    // BL
+                          rightX,   // UR
+                          leftX);   // UL
+        return cornerOffsetsX;
+    }
+
+    // Top/bottom edge values of the sample bounding box. 0x00 and 0xF0 equal
+    // the smallest and largest Y literals in vY() (0.0 and 0.9375) times 256.
+    // _mm_set_epi32 takes the highest 32-bit element first: BR, BL, UR, UL.
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        static const uint32_t topY = 0x00;
+        static const uint32_t bottomY = 0xF0;
+        static const __m128i cornerOffsetsY =
+            _mm_set_epi32(bottomY,   // BR
+                          bottomY,   // BL
+                          topY,      // UR
+                          topY);     // UL
+        return cornerOffsetsY;
+    }
+
+    // Offset of sample sampleNum's color data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t colorOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+            sampleStride * 8,
+            sampleStride * 9,
+            sampleStride * 10,
+            sampleStride * 11,
+            sampleStride * 12,
+            sampleStride * 13,
+            sampleStride * 14,
+            sampleStride * 15,
+        };
+        assert(sampleNum < numSamples);
+        return colorOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's depth data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t depthOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+            sampleStride * 8,
+            sampleStride * 9,
+            sampleStride * 10,
+            sampleStride * 11,
+            sampleStride * 12,
+            sampleStride * 13,
+            sampleStride * 14,
+            sampleStride * 15,
+        };
+        assert(sampleNum < numSamples);
+        return depthOffsetOfSample[sampleNum];
+    }
+
+    // Offset of sample sampleNum's stencil data; sample k starts k tiles
+    // (tile width * tile height * bpp / 8 each) after sample 0.
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t sampleStride =
+            KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        static const uint32_t stencilOffsetOfSample[numSamples] =
+        {
+            0,
+            sampleStride,
+            sampleStride * 2,
+            sampleStride * 3,
+            sampleStride * 4,
+            sampleStride * 5,
+            sampleStride * 6,
+            sampleStride * 7,
+            sampleStride * 8,
+            sampleStride * 9,
+            sampleStride * 10,
+            sampleStride * 11,
+            sampleStride * 12,
+            sampleStride * 13,
+            sampleStride * 14,
+            sampleStride * 15,
+        };
+        assert(sampleNum < numSamples);
+        return stencilOffsetOfSample[sampleNum];
+    }
+
+    // Mask value 0xFFFF (bits 0-15 set, one per sample) in every 32-bit lane.
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari allSamplesMask = _simd_set1_epi32(0xFFFF);
+        return allSamplesMask;
+    }
+
+    // Position tables are only declared here; their values are defined
+    // outside this struct.
+    static const uint32_t samplePosXi[16];
+    static const uint32_t samplePosYi[16];
+    static const float samplePosX[16];
+    static const float samplePosY[16];
+    static const uint32_t numSamples = 16;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
new file mode 100644
index 00000000000..2028d9fbcfe
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -0,0 +1,1208 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file pa.h
+*
+* @brief Definitions for primitive assembly.
+* N primitives are assembled at a time, where N is the SIMD width.
+* A state machine, that is specific for a given topology, drives the
+* assembly of vertices into triangles.
+*
+******************************************************************************/
+#pragma once
+
+#include "frontend.h"
+
+// Abstract base for the primitive assemblers in this file. It carries the
+// draw context and the vertex stream; every assembly operation is a pure
+// virtual implemented by the derived assemblers.
+struct PA_STATE
+{
+    DRAW_CONTEXT *pDC;              // draw context
+    uint8_t* pStreamBase;           // vertex stream
+    uint32_t streamSizeInVerts;     // total size of the input stream in verts
+
+    // The topology the binner will use. In some cases the FE changes the topology from the api state.
+    PRIMITIVE_TOPOLOGY binTopology;
+
+    // Every member gets a defined value so a default-constructed assembler
+    // never exposes indeterminate pointers or an indeterminate topology.
+    PA_STATE() :
+        pDC(nullptr), pStreamBase(nullptr), streamSizeInVerts(0), binTopology(TOP_UNKNOWN) {}
+
+    // binTopology starts as TOP_UNKNOWN; derived constructors assign the
+    // real topology afterwards.
+    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
+        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
+        binTopology(TOP_UNKNOWN) {}
+
+    // Polymorphic base: a virtual destructor keeps destruction through a
+    // PA_STATE pointer or reference well defined.
+    virtual ~PA_STATE() {}
+
+    // True while the assembler still has primitives/vertices left to process.
+    virtual bool HasWork() = 0;
+    // Attribute `slot` of the simd vertex at `index` in the vertex stream.
+    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
+    // Assembles a SIMD batch of primitives for attribute `slot` into verts[].
+    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
+    // Assembles the single primitive `primIndex` for attribute `slot` into verts[].
+    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+    // Advances the assembler state after a batch has been consumed.
+    virtual bool NextPrim() = 0;
+    // Storage the next vertex shader output should be written to.
+    virtual simdvertex& GetNextVsOutput() = 0;
+    virtual bool GetNextStreamOutput() = 0;
+    virtual simdmask& GetNextVsIndices() = 0;
+    virtual uint32_t NumPrims() = 0;
+    // Rewinds the assembler to its initial state.
+    virtual void Reset() = 0;
+    // Primitive IDs for the current batch, offset by startID.
+    virtual simdscalari GetPrimID(uint32_t startID) = 0;
+};
+
+// The Optimized PA is a state machine that assembles triangles from vertex shader simd
+// output. Here is the sequence
+// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
+// 2. Execute PA function to assemble and bin triangles.
+// a. The PA function is a set of functions that collectively make up the
+// state machine for a given topology.
+// 1. We use a state index to track which PA function to call.
+// b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
+// 1. We call this the current and previous simd vertex.
+// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
+// order to assemble the second triangle, for a triangle list, we'll need the
+// last vertex from the previous simd and the first 2 vertices from the current simd.
+// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
+//
+// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws without
+// cuts.
+struct PA_STATE_OPT : public PA_STATE
+{
+    simdvertex leadingVertex;            // For tri-fan
+    uint32_t numPrims;                   // Total number of primitives for draw.
+    uint32_t numPrimsComplete;           // Total number of complete primitives.
+
+    uint32_t numSimdPrims;               // Number of prims in current simd.
+
+    uint32_t cur;                        // index to current VS output.
+    uint32_t prev;                       // index to prev VS output. Not really needed in the state.
+    uint32_t first;                      // index to first VS output. Used for trifan.
+
+    uint32_t counter;                    // state counter
+    bool reset;                          // reset state
+
+    uint32_t primIDIncr;                 // how much to increment for each vector (typically vector / {1, 2})
+    simdscalari primID;
+
+    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
+    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+    PFN_PA_FUNC pfnPaFunc;               // PA state machine function for assembling 4 triangles.
+    PFN_PA_SINGLE_FUNC pfnPaSingleFunc;  // PA state machine function for assembling single triangle.
+    PFN_PA_FUNC pfnPaFuncReset;          // initial state to set on reset
+
+    // state used to advance the PA when Next is called
+    PFN_PA_FUNC pfnPaNextFunc;
+    uint32_t nextNumSimdPrims;
+    uint32_t nextNumPrimsIncrement;
+    bool nextReset;
+    bool isStreaming;
+
+    simdmask tmpIndices;                 // temporary index store for unused virtual function
+
+    PA_STATE_OPT() {}
+    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
+        bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+
+    // True until every primitive of the draw has been completed.
+    bool HasWork() override
+    {
+        return this->numPrimsComplete < this->numPrims;
+    }
+
+    // Attribute `slot` of simd vertex `index`, viewing the stream as an
+    // array of simdvertex.
+    simdvector& GetSimdVector(uint32_t index, uint32_t slot) override
+    {
+        simdvertex* pVertex = (simdvertex*)pStreamBase;
+        return pVertex[index].attrib[slot];
+    }
+
+    // Assembles 4 triangles. Each simdvector is a single vertex from 4
+    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
+    bool Assemble(uint32_t slot, simdvector verts[]) override
+    {
+        return this->pfnPaFunc(*this, slot, verts);
+    }
+
+    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
+    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) override
+    {
+        this->pfnPaSingleFunc(*this, slot, primIndex, verts);
+    }
+
+    // Commits the state staged by SetNextState() and reports whether the
+    // current simd still holds primitives to assemble.
+    bool NextPrim() override
+    {
+        this->pfnPaFunc = this->pfnPaNextFunc;
+        this->numSimdPrims = this->nextNumSimdPrims;
+        this->numPrimsComplete += this->nextNumPrimsIncrement;
+        this->reset = this->nextReset;
+
+        // A streaming PA never rewinds its counter.
+        if (this->isStreaming)
+        {
+            this->reset = false;
+        }
+
+        bool morePrims = false;
+
+        if (this->numSimdPrims > 0)
+        {
+            morePrims = true;
+            this->numSimdPrims--;
+        }
+        else
+        {
+            // Current simd is exhausted: move on to the next VS output, or
+            // back to the first one when a reset was requested.
+            this->counter = (this->reset) ? 0 : (this->counter + 1);
+            this->reset = false;
+        }
+
+        if (!HasWork())
+        {
+            morePrims = false;    // no more to do
+        }
+
+        return morePrims;
+    }
+
+    // Advances cur/prev and returns the simd vertex the next VS output
+    // should be written to; the stream is used as a ring of simd vertices.
+    simdvertex& GetNextVsOutput() override
+    {
+        // increment cur and prev indices
+        // NOTE(review): numSimdVerts is 0 when streamSizeInVerts < KNOB_SIMD_WIDTH,
+        // which would make the modulo below undefined - confirm callers never do that.
+        const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
+        this->prev = this->cur;  // prev is undefined for first state.
+        this->cur = this->counter % numSimdVerts;
+
+        simdvertex* pVertex = (simdvertex*)pStreamBase;
+        return pVertex[this->cur];
+    }
+
+    simdmask& GetNextVsIndices() override
+    {
+        // unused in optimized PA, pass tmp buffer back
+        return tmpIndices;
+    }
+
+    // Advances cur/prev without wrapping and reports whether work remains.
+    bool GetNextStreamOutput() override
+    {
+        this->prev = this->cur;
+        this->cur = this->counter;
+
+        return HasWork();
+    }
+
+    // Number of primitives in the pending batch: KNOB_SIMD_WIDTH, reduced by
+    // the overshoot when the staged increment would run past numPrims.
+    uint32_t NumPrims() override
+    {
+        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
+            (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
+    }
+
+    // Stages the state NextPrim() will commit. The single-primitive function
+    // is not staged: it takes effect immediately.
+    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+        uint32_t numSimdPrims = 0,
+        uint32_t numPrimsIncrement = 0,
+        bool reset = false)
+    {
+        this->pfnPaNextFunc = pfnPaNextFunc;
+        this->nextNumSimdPrims = numSimdPrims;
+        this->nextNumPrimsIncrement = numPrimsIncrement;
+        this->nextReset = reset;
+
+        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
+    }
+
+    // Restores the initial state function and zeroes the progress counters
+    // and vertex indices. The staged next* fields are left untouched.
+    void Reset() override
+    {
+        this->pfnPaFunc = this->pfnPaFuncReset;
+        this->numPrimsComplete = 0;
+        this->numSimdPrims = 0;
+        this->cur = 0;
+        this->prev = 0;
+        this->first = 0;
+        this->counter = 0;
+        this->reset = false;
+    }
+
+    // primID plus (startID + primIDIncr * number of completed full simd batches).
+    simdscalari GetPrimID(uint32_t startID) override
+    {
+        return _simd_add_epi32(this->primID,
+            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
+    }
+};
+
+// Free-function form of PA_STATE_OPT::SetNextState, kept so the PA topology
+// state functions do not have to be rewritten as member calls.
+INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+    uint32_t numSimdPrims = 0,
+    uint32_t numPrimsIncrement = 0,
+    bool reset = false)
+{
+    pa.SetNextState(pfnPaNextFunc,
+                    pfnPaNextSingleFunc,
+                    numSimdPrims,
+                    numPrimsIncrement,
+                    reset);
+}
+// Free-function form of PA_STATE::GetSimdVector; forwards to the virtual
+// implementation of whichever assembler `pa` refers to.
+INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
+{
+    simdvector& attrib = pa.GetSimdVector(index, slot);
+    return attrib;
+}
+
+// Gathers lane 0 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane0(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpacklo_ps(a.x, a.z);        // x0 z0 x1 z1 | x4 z4 x5 z5
+    const simdscalar ywPairs = _mm256_unpacklo_ps(a.y, a.w);        // y0 w0 y1 w1 | y4 w4 y5 w5
+    const simdscalar xyzw = _mm256_unpacklo_ps(xzPairs, ywPairs);   // lane 0 | lane 4
+    return _mm256_extractf128_ps(xyzw, 0);
+}
+
+// Gathers lane 1 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane1(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpacklo_ps(a.x, a.z);        // x0 z0 x1 z1 | x4 z4 x5 z5
+    const simdscalar ywPairs = _mm256_unpacklo_ps(a.y, a.w);        // y0 w0 y1 w1 | y4 w4 y5 w5
+    const simdscalar xyzw = _mm256_unpackhi_ps(xzPairs, ywPairs);   // lane 1 | lane 5
+    return _mm256_extractf128_ps(xyzw, 0);
+}
+
+// Gathers lane 2 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane2(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpackhi_ps(a.x, a.z);        // x2 z2 x3 z3 | x6 z6 x7 z7
+    const simdscalar ywPairs = _mm256_unpackhi_ps(a.y, a.w);        // y2 w2 y3 w3 | y6 w6 y7 w7
+    const simdscalar xyzw = _mm256_unpacklo_ps(xzPairs, ywPairs);   // lane 2 | lane 6
+    return _mm256_extractf128_ps(xyzw, 0);
+}
+
+// Gathers lane 3 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane3(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpackhi_ps(a.x, a.z);        // x2 z2 x3 z3 | x6 z6 x7 z7
+    const simdscalar ywPairs = _mm256_unpackhi_ps(a.y, a.w);        // y2 w2 y3 w3 | y6 w6 y7 w7
+    const simdscalar xyzw = _mm256_unpackhi_ps(xzPairs, ywPairs);   // lane 3 | lane 7
+    return _mm256_extractf128_ps(xyzw, 0);
+}
+
+// Gathers lane 4 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane4(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpacklo_ps(a.x, a.z);        // x0 z0 x1 z1 | x4 z4 x5 z5
+    const simdscalar ywPairs = _mm256_unpacklo_ps(a.y, a.w);        // y0 w0 y1 w1 | y4 w4 y5 w5
+    const simdscalar xyzw = _mm256_unpacklo_ps(xzPairs, ywPairs);   // lane 0 | lane 4
+    return _mm256_extractf128_ps(xyzw, 1);
+}
+
+// Gathers lane 5 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane5(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpacklo_ps(a.x, a.z);        // x0 z0 x1 z1 | x4 z4 x5 z5
+    const simdscalar ywPairs = _mm256_unpacklo_ps(a.y, a.w);        // y0 w0 y1 w1 | y4 w4 y5 w5
+    const simdscalar xyzw = _mm256_unpackhi_ps(xzPairs, ywPairs);   // lane 1 | lane 5
+    return _mm256_extractf128_ps(xyzw, 1);
+}
+
+// Gathers lane 6 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane6(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpackhi_ps(a.x, a.z);        // x2 z2 x3 z3 | x6 z6 x7 z7
+    const simdscalar ywPairs = _mm256_unpackhi_ps(a.y, a.w);        // y2 w2 y3 w3 | y6 w6 y7 w7
+    const simdscalar xyzw = _mm256_unpacklo_ps(xzPairs, ywPairs);   // lane 2 | lane 6
+    return _mm256_extractf128_ps(xyzw, 1);
+}
+
+// Gathers lane 7 of a.x/a.y/a.z/a.w into a single (x, y, z, w) vector.
+INLINE __m128 swizzleLane7(const simdvector &a)
+{
+    const simdscalar xzPairs = _mm256_unpackhi_ps(a.x, a.z);        // x2 z2 x3 z3 | x6 z6 x7 z7
+    const simdscalar ywPairs = _mm256_unpackhi_ps(a.y, a.w);        // y2 w2 y3 w3 | y6 w6 y7 w7
+    const simdscalar xyzw = _mm256_unpackhi_ps(xzPairs, ywPairs);   // lane 3 | lane 7
+    return _mm256_extractf128_ps(xyzw, 1);
+}
+
+// Runtime-indexed form of swizzleLane0..7: returns the (x, y, z, w) vector
+// for `lane`, or all zeros when `lane` is outside 0..7.
+INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
+{
+    switch (lane)
+    {
+    case 0: return swizzleLane0(a);
+    case 1: return swizzleLane1(a);
+    case 2: return swizzleLane2(a);
+    case 3: return swizzleLane3(a);
+    case 4: return swizzleLane4(a);
+    case 5: return swizzleLane5(a);
+    case 6: return swizzleLane6(a);
+    case 7: return swizzleLane7(a);
+    default: break;
+    }
+
+    // Unknown lane index.
+    return _mm_setzero_ps();
+}
+
+// Cut-aware primitive assembler.
+struct PA_STATE_CUT : public PA_STATE
+{
+ simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex
+ uint32_t numVerts; // number of vertices available in buffer store
+ uint32_t numAttribs; // number of attributes
+ int32_t numRemainingVerts; // number of verts remaining to be assembled
+ uint32_t numVertsToAssemble; // total number of verts to assemble for the draw
+ OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
+ simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
+ uint32_t numPrimsAssembled; // number of primitives that are fully assembled
+ uint32_t headVertex; // current unused vertex slot in vertex buffer store
+ uint32_t tailVertex; // beginning vertex currently assembling
+ uint32_t curVertex; // current unprocessed vertex
+ uint32_t startPrimId; // starting prim id
+ simdscalari vPrimId; // vector of prim ID
+ bool needOffsets; // need to compute gather offsets for current SIMD
+ uint32_t vertsPerPrim;
+ simdvertex tmpVertex; // temporary simdvertex for unimplemented API
+ bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they
+ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
+ // while the GS sends valid verts for every index
+ // Topology state tracking
+ uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
+ uint32_t curIndex;
+ bool reverseWinding; // indicates reverse winding for strips
+ int32_t adjExtraVert; // extra vert uses for tristrip w/ adj
+
+ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
+ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+
+ // Default constructor: performs no initialization. PA_FACTORY default-constructs its
+ // paCut member and re-initializes it in place with the constructor below when needed.
+ PA_STATE_CUT() {}
+
+ // Builds a cut-aware primitive assembler.
+ //   pDC                  - draw context; only the GS-enable state is read here
+ //   in_pStream           - vertex store, forwarded to PA_STATE
+ //   in_streamSizeInVerts - capacity of the vertex store in vertices (also the wrap modulus)
+ //   in_pIndices          - per-SIMD cut masks
+ //   in_numVerts          - number of vertices to assemble
+ //   in_numAttribs        - number of vertex attributes
+ //   topo                 - input topology; selects the per-vertex handler
+ //   in_processCutVerts   - whether vertices flagged as cuts are still fed to the handler
+ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
+     uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
+     : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+ {
+     numVerts = in_streamSizeInVerts;
+     numAttribs = in_numAttribs;
+     binTopology = topo;
+     needOffsets = false;
+     processCutVerts = in_processCutVerts;
+
+     numVertsToAssemble = numRemainingVerts = in_numVerts;
+     numPrimsAssembled = 0;
+     headVertex = tailVertex = curVertex = 0;
+
+     curIndex = 0;
+     pCutIndices = in_pIndices;
+     memset(indices, 0, sizeof(indices));
+     // start from a known state so no indeterminate vertex index can ever be gathered
+     memset(vert, 0, sizeof(vert));
+     vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+     reverseWinding = false;
+     adjExtraVert = -1;
+
+     bool gsEnabled = pDC->pState->state.gsState.gsEnable;
+     vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
+
+     // With asserts compiled out an unsupported topology would otherwise leave pfnPa
+     // indeterminate; a null handler at least fails deterministically.
+     pfnPa = nullptr;
+
+     switch (topo)
+     {
+     case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
+     case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
+     case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
+     case TOP_TRI_STRIP_ADJ:
+         if (gsEnabled)
+         {
+             pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
+         }
+         else
+         {
+             pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
+         }
+         break;
+
+     case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
+     case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
+     case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
+     case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
+     case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
+     default: assert(0 && "Unimplemented topology");
+     }
+ }
+
+ // Returns the simdvertex slot at the current head of the vertex store and moves the head
+ // forward by one SIMD batch, wrapping at numVerts. Gather offsets are flagged stale.
+ simdvertex& GetNextVsOutput()
+ {
+     const uint32_t simdSlot = headVertex / KNOB_SIMD_WIDTH;
+     simdvertex* const pSimdVerts = (simdvertex*)pStreamBase;
+
+     needOffsets = true;
+     headVertex = (headVertex + KNOB_SIMD_WIDTH) % numVerts;
+
+     return pSimdVerts[simdSlot];
+ }
+
+ // Returns the cut mask belonging to the SIMD batch at the current head (head is not moved).
+ simdmask& GetNextVsIndices()
+ {
+     const uint32_t simdSlot = headVertex / KNOB_SIMD_WIDTH;
+     return pCutIndices[simdSlot];
+ }
+
+ // Not supported by this assembler: asserts, then hands back a placeholder reference so the
+ // function still returns something valid.
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+ {
+     SWR_ASSERT(0 && "Not implemented");
+     simdvector& placeholder = tmpVertex.attrib[0];
+     return placeholder;
+ }
+
+ // Moves the head past one SIMD batch (no wrap), marks the gather offsets stale and reports
+ // whether there is still work left.
+ bool GetNextStreamOutput()
+ {
+     needOffsets = true;
+     headVertex += KNOB_SIMD_WIDTH;
+     return HasWork();
+ }
+
+ // Per-lane primitive IDs: startID added to the running per-lane ID vector.
+ simdscalari GetPrimID(uint32_t startID)
+ {
+     const simdscalari vStart = _simd_set1_epi32(startID);
+     return _simd_add_epi32(vStart, vPrimId);
+ }
+
+ // Rewinds the assembler to the state it had right after construction, keeping the
+ // stream, topology and vertex count it was created with.
+ void Reset()
+ {
+     // vertex store cursors
+     headVertex = 0;
+     tailVertex = 0;
+     curVertex = 0;
+
+     // progress counters
+     numRemainingVerts = numVertsToAssemble;
+     numPrimsAssembled = 0;
+     vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+
+     // topology tracking
+     curIndex = 0;
+     reverseWinding = false;
+     adjExtraVert = -1;
+ }
+
+ // True while vertices remain unprocessed or a tristrip-with-adjacency vertex is still pending.
+ bool HasWork()
+ {
+     if (numRemainingVerts > 0)
+     {
+         return true;
+     }
+     return adjExtraVert != -1;
+ }
+
+ // The store is full when advancing the head by one SIMD batch would land on the tail.
+ bool IsVertexStoreFull()
+ {
+     return tailVertex == ((headVertex + KNOB_SIMD_WIDTH) % numVerts);
+ }
+
+ // Drops any partially built primitive so the next vertex starts a fresh one.
+ void RestartTopology()
+ {
+     adjExtraVert = -1;
+     reverseWinding = false;
+     curIndex = 0;
+ }
+
+ // Returns true when the cut bit for 'vertex' is set.
+ // The masks are stored one per SIMD batch; the low bits of 'vertex' select the bit inside it.
+ bool IsCutIndex(uint32_t vertex)
+ {
+     const uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
+     const uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
+
+     // Test the bit on the mask element itself rather than reinterpreting its address as a
+     // LONG: that avoids a type-punned access that reads more bytes than one mask element.
+     // NOTE(review): assumes simdmask is an integral bitmask type - confirm.
+     return ((pCutIndices[vertexIndex] >> vertexOffset) & 1) != 0;
+ }
+
+ // Consumes unprocessed vertices one at a time until a full SIMD batch of primitives has
+ // been assembled, the draw runs out of vertices, or the cursor catches up with the head.
+ void ProcessVerts()
+ {
+     while (numPrimsAssembled != KNOB_SIMD_WIDTH &&
+            numRemainingVerts > 0 &&
+            curVertex != headVertex)
+     {
+         const bool isCut = IsCutIndex(curVertex);
+
+         // ordinary vertices always go to the handler; cut vertices only when requested
+         if (!isCut || processCutVerts)
+         {
+             (this->*pfnPa)(curVertex, false);
+         }
+
+         if (isCut)
+         {
+             // flush a pending tri strip w/ adj primitive, then restart the topology
+             if (adjExtraVert != -1)
+             {
+                 (this->*pfnPa)(curVertex, true);
+             }
+             RestartTopology();
+         }
+
+         curVertex = (curVertex + 1) % numVerts;
+         numRemainingVerts--;
+     }
+
+     // tri strip w/ adj: the final primitive only completes once the input is exhausted
+     const bool batchHasRoom = (numPrimsAssembled != KNOB_SIMD_WIDTH);
+     if (batchHasRoom && numRemainingVerts == 0 && adjExtraVert != -1)
+     {
+         (this->*pfnPa)(curVertex, true);
+     }
+ }
+
+ // Closes the current batch: the tail catches up with the first unsubmitted vertex, the
+ // primitive count restarts and the per-lane primitive IDs step forward by one SIMD width.
+ void Advance()
+ {
+     const simdscalari vStep = _simd_set1_epi32(KNOB_SIMD_WIDTH);
+     vPrimId = _simd_add_epi32(vPrimId, vStep);
+
+     numPrimsAssembled = 0;
+     tailVertex = curVertex;
+ }
+
+ // Advances to the next batch once the current one is full or the input is exhausted.
+ // Always returns false.
+ bool NextPrim()
+ {
+     const bool batchFull = (numPrimsAssembled == KNOB_SIMD_WIDTH);
+     const bool inputDrained = (numRemainingVerts <= 0);
+
+     if (batchFull || inputDrained)
+     {
+         Advance();
+     }
+     return false;
+ }
+
+ // Converts the gathered vertex indices of the current batch into byte offsets into the
+ // vertex store: (index / SIMD width) selects the simdvertex, (index % SIMD width) the lane.
+ void ComputeOffsets()
+ {
+     // The shift must stay a literal for the shift intrinsic; tie it to the knob so that a
+     // different SIMD width fails to compile instead of silently producing wrong offsets.
+     const uint32_t simdShift = 3; // log2(KNOB_SIMD_WIDTH)
+     const uint32_t simdMask = KNOB_SIMD_WIDTH - 1;
+     static_assert((1u << simdShift) == KNOB_SIMD_WIDTH, "simdShift must equal log2(KNOB_SIMD_WIDTH)");
+
+     // loop-invariant constants
+     const simdscalari vBatchStride = _simd_set1_epi32(sizeof(simdvertex));
+     const simdscalari vLaneStride = _simd_set1_epi32(sizeof(float));
+     const simdscalari vLaneMask = _simd_set1_epi32(simdMask);
+
+     for (uint32_t v = 0; v < vertsPerPrim; ++v)
+     {
+         simdscalari vIndices = *(simdscalari*)&indices[v][0];
+
+         // step to simdvertex batch
+         simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
+         vOffsets[v] = _simd_mullo_epi32(vVertexBatch, vBatchStride);
+
+         // step to index
+         simdscalari vVertexIndex = _simd_and_si(vIndices, vLaneMask);
+         vOffsets[v] = _simd_add_epi32(vOffsets[v], _simd_mullo_epi32(vVertexIndex, vLaneStride));
+     }
+ }
+
+ // Assembles attribute 'slot' for the current batch of primitives.
+ // result[v] receives the four components of vertex v of every primitive in the batch.
+ // Returns false when the batch is not yet complete and more vertices are still expected.
+ bool Assemble(uint32_t slot, simdvector result[])
+ {
+     // consume whatever vertices are available first
+     ProcessVerts();
+
+     // ready once the batch is full, or once no further vertices will arrive
+     const bool batchReady = (numPrimsAssembled == KNOB_SIMD_WIDTH) || !(numRemainingVerts > 0);
+     if (!batchReady)
+     {
+         return false;
+     }
+
+     // gather offsets only depend on the indices, so compute them once per batch
+     if (needOffsets)
+     {
+         ComputeOffsets();
+         needOffsets = false;
+     }
+
+     for (uint32_t v = 0; v < vertsPerPrim; ++v)
+     {
+         // byte offset of the requested attribute for each lane
+         simdscalari vGather = vOffsets[v];
+         vGather = _simd_add_epi32(vGather, _simd_set1_epi32(slot * sizeof(simdvector)));
+
+         // each component sits one SIMD row of floats after the previous one
+         float* pComponent = (float*)pStreamBase;
+         for (uint32_t comp = 0; comp < 4; ++comp)
+         {
+             result[v].v[comp] = _simd_i32gather_ps(pComponent, vGather, 1);
+             pComponent += KNOB_SIMD_WIDTH;
+         }
+     }
+
+     return true;
+ }
+
+ // Scalar counterpart of Assemble: extracts attribute 'slot' of primitive 'triIndex' of the
+ // current batch, writing one 4-component vertex per entry of 'tri'.
+ void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
+ {
+     for (uint32_t v = 0; v < vertsPerPrim; ++v)
+     {
+         // the cached gather offsets double as a per-lane table of byte offsets
+         const uint32_t* pLaneOffsets = (const uint32_t*)&vOffsets[v];
+
+         // start at the primitive's lane, then step to the requested attribute
+         uint32_t byteOffset = pLaneOffsets[triIndex];
+         byteOffset += sizeof(simdvector) * slot;
+
+         float* pOut = (float*)&tri[v];
+         for (uint32_t comp = 0; comp < 4; ++comp)
+         {
+             pOut[comp] = *(float*)(pStreamBase + byteOffset);
+             byteOffset += KNOB_SIMD_WIDTH * sizeof(float);
+         }
+     }
+ }
+
+ // Number of primitives assembled so far in the current batch.
+ uint32_t NumPrims()
+ {
+     return numPrimsAssembled;
+ }
+
+ // Per-topology functions
+
+ // Triangle strip: after the first two vertices every new vertex completes a triangle;
+ // the winding flag alternates so that all triangles keep a consistent orientation.
+ void ProcessVertTriStrip(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 3)
+     {
+         return;
+     }
+
+     // emit the triangle, swapping the last two vertices when the winding is reversed
+     indices[0][numPrimsAssembled] = vert[0];
+     indices[1][numPrimsAssembled] = reverseWinding ? vert[2] : vert[1];
+     indices[2][numPrimsAssembled] = reverseWinding ? vert[1] : vert[2];
+     numPrimsAssembled++;
+
+     // keep the last two vertices as the start of the next triangle
+     vert[0] = vert[1];
+     vert[1] = vert[2];
+     curIndex = 2;
+     reverseWinding = !reverseWinding;
+ }
+
+ // Emits the triangle currently held in vert[] for a tri strip with adjacency.
+ // With a GS all six vertices are emitted; without one only vert[0], vert[2] and vert[4]
+ // are emitted as a plain triangle.
+ template<bool gsEnabled>
+ void AssembleTriStripAdj()
+ {
+     if (gsEnabled)
+     {
+         for (uint32_t i = 0; i < 6; ++i)
+         {
+             indices[i][numPrimsAssembled] = vert[i];
+         }
+     }
+     else
+     {
+         indices[0][numPrimsAssembled] = vert[0];
+         indices[1][numPrimsAssembled] = vert[2];
+         indices[2][numPrimsAssembled] = vert[4];
+
+         // side effect callers rely on: vert[1] ends up mirroring vert[2]
+         vert[1] = vert[2];
+     }
+     numPrimsAssembled++;
+ }
+
+
+ // Rotates the vert[] window to the next triangle of a tri strip w/ adjacency and flips the winding.
+ void AdvanceTriStripAdjWindow()
+ {
+     uint32_t nextTri[6];
+
+     // vert[3] is stored by every path that emits a primitive before it is read, so simply
+     // carry it over instead of copying an uninitialized value into vert[].
+     nextTri[3] = vert[3];
+
+     if (reverseWinding)
+     {
+         nextTri[0] = vert[4];
+         nextTri[1] = vert[0];
+         nextTri[2] = vert[2];
+         nextTri[4] = vert[3];
+         nextTri[5] = adjExtraVert;
+     }
+     else
+     {
+         nextTri[0] = vert[2];
+         nextTri[1] = adjExtraVert;
+         nextTri[2] = vert[3];
+         nextTri[4] = vert[4];
+         nextTri[5] = vert[0];
+     }
+
+     for (uint32_t i = 0; i < 6; ++i)
+     {
+         vert[i] = nextTri[i];
+     }
+     reverseWinding ^= 1;
+ }
+
+ // Per-vertex handler for triangle strips with adjacency.
+ //   index  - vertex index to consume
+ //   finish - when true and an extra vertex is pending, emit the last triangle of the strip
+ // curIndex acts as the state of a small state machine; adjExtraVert holds the vertex that
+ // arrived one step early and is only placed once the following vertex is known.
+ template<bool gsEnabled>
+ void ProcessVertTriStripAdj(uint32_t index, bool finish)
+ {
+     // handle last primitive of tristrip
+     if (finish && adjExtraVert != -1)
+     {
+         vert[3] = adjExtraVert;
+         AssembleTriStripAdj<gsEnabled>();
+         adjExtraVert = -1;
+         return;
+     }
+
+     switch (curIndex)
+     {
+     case 0:
+     case 1:
+     case 2:
+     case 4:
+         vert[curIndex] = index;
+         curIndex++;
+         break;
+     case 3:
+         vert[5] = index;
+         curIndex++;
+         break;
+     case 5:
+         if (adjExtraVert == -1)
+         {
+             // hold this vertex back until the next one arrives
+             adjExtraVert = index;
+         }
+         else
+         {
+             vert[3] = index;
+             if (!gsEnabled)
+             {
+                 AssembleTriStripAdj<gsEnabled>();
+                 AdvanceTriStripAdjWindow();
+                 adjExtraVert = -1;
+             }
+             else
+             {
+                 curIndex++;
+             }
+         }
+         break;
+     case 6:
+         SWR_ASSERT(adjExtraVert != -1, "Algorith failure!");
+         AssembleTriStripAdj<gsEnabled>();
+         AdvanceTriStripAdjWindow();
+         adjExtraVert = index;
+         curIndex--;
+         break;
+     }
+ }
+
+ // Triangle list: every three vertices form one independent triangle.
+ void ProcessVertTriList(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 3)
+     {
+         return;
+     }
+
+     // primitive complete: record its gather indices and start over
+     for (uint32_t i = 0; i < 3; ++i)
+     {
+         indices[i][numPrimsAssembled] = vert[i];
+     }
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+ // Triangle list with adjacency: every six vertices form one primitive and all six are emitted.
+ void ProcessVertTriListAdj(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 6)
+     {
+         return;
+     }
+
+     // primitive complete: record its gather indices and start over
+     for (uint32_t i = 0; i < 6; ++i)
+     {
+         indices[i][numPrimsAssembled] = vert[i];
+     }
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+ // Triangle list with adjacency, no GS: six vertices are consumed per primitive but only
+ // the even ones (0, 2, 4) are emitted as the triangle.
+ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 6)
+     {
+         return;
+     }
+
+     for (uint32_t i = 0; i < 3; ++i)
+     {
+         indices[i][numPrimsAssembled] = vert[2 * i];
+     }
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+
+ // Line list: every two vertices form one independent line.
+ void ProcessVertLineList(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 2)
+     {
+         return;
+     }
+
+     indices[0][numPrimsAssembled] = vert[0];
+     indices[1][numPrimsAssembled] = vert[1];
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+ // Line strip: after the first vertex every new vertex completes a line that starts at
+ // the previous vertex.
+ void ProcessVertLineStrip(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 2)
+     {
+         return;
+     }
+
+     indices[0][numPrimsAssembled] = vert[0];
+     indices[1][numPrimsAssembled] = vert[1];
+     numPrimsAssembled++;
+
+     // the end point becomes the start of the next line
+     vert[0] = vert[1];
+     curIndex = 1;
+ }
+
+ // Line strip with adjacency: a sliding window of four vertices; all four are emitted.
+ void ProcessVertLineStripAdj(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 4)
+     {
+         return;
+     }
+
+     for (uint32_t i = 0; i < 4; ++i)
+     {
+         indices[i][numPrimsAssembled] = vert[i];
+     }
+     numPrimsAssembled++;
+
+     // slide the window forward by one vertex
+     for (uint32_t i = 0; i < 3; ++i)
+     {
+         vert[i] = vert[i + 1];
+     }
+     curIndex = 3;
+ }
+
+ // Line strip with adjacency, no GS: same four-vertex sliding window, but only the two
+ // middle vertices (1, 2) are emitted as the line.
+ void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 4)
+     {
+         return;
+     }
+
+     indices[0][numPrimsAssembled] = vert[1];
+     indices[1][numPrimsAssembled] = vert[2];
+     numPrimsAssembled++;
+
+     // slide the window forward by one vertex
+     for (uint32_t i = 0; i < 3; ++i)
+     {
+         vert[i] = vert[i + 1];
+     }
+     curIndex = 3;
+ }
+
+ // Line list with adjacency: every four vertices form one primitive and all four are emitted.
+ void ProcessVertLineListAdj(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 4)
+     {
+         return;
+     }
+
+     for (uint32_t i = 0; i < 4; ++i)
+     {
+         indices[i][numPrimsAssembled] = vert[i];
+     }
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+ // Line list with adjacency, no GS: four vertices are consumed per primitive but only the
+ // two middle ones (1, 2) are emitted as the line.
+ void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 4)
+     {
+         return;
+     }
+
+     indices[0][numPrimsAssembled] = vert[1];
+     indices[1][numPrimsAssembled] = vert[2];
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+
+ // Point list: every vertex is its own primitive.
+ void ProcessVertPointList(uint32_t index, bool finish)
+ {
+     vert[curIndex++] = index;
+     if (curIndex != 1)
+     {
+         return;
+     }
+
+     indices[0][numPrimsAssembled] = vert[0];
+     numPrimsAssembled++;
+     curIndex = 0;
+ }
+};
+
+// Primitive Assembly for data output from the DomainShader.
+// Primitives are described by up to three index lists (one per vertex of the primitive)
+// into a vertex data array; each call assembles at most KNOB_SIMD_WIDTH primitives.
+struct PA_TESS : PA_STATE
+{
+ // in_pVertData                - base of the vertex data the indices refer to
+ // in_attributeStrideInVectors - distance between consecutive components, in SIMD vectors
+ // in_numAttributes            - number of attributes (only used to validate 'slot')
+ // in_ppIndices                - index lists, one per vertex of the primitive
+ // in_numPrims                 - total number of primitives to assemble
+ // in_binTopology              - point, line or triangle list; fixes the verts per primitive
+ PA_TESS(
+ DRAW_CONTEXT *in_pDC,
+ const simdscalar* in_pVertData,
+ uint32_t in_attributeStrideInVectors,
+ uint32_t in_numAttributes,
+ uint32_t* (&in_ppIndices)[3],
+ uint32_t in_numPrims,
+ PRIMITIVE_TOPOLOGY in_binTopology) :
+
+ PA_STATE(in_pDC, nullptr, 0),
+ m_pVertexData(in_pVertData),
+ m_attributeStrideInVectors(in_attributeStrideInVectors),
+ m_numAttributes(in_numAttributes),
+ m_numPrims(in_numPrims)
+ {
+ m_vPrimId = _simd_setzero_si();
+ binTopology = in_binTopology;
+ m_ppIndices[0] = in_ppIndices[0];
+ m_ppIndices[1] = in_ppIndices[1];
+ m_ppIndices[2] = in_ppIndices[2];
+
+ switch (binTopology)
+ {
+ case TOP_POINT_LIST:
+ m_numVertsPerPrim = 1;
+ break;
+
+ case TOP_LINE_LIST:
+ m_numVertsPerPrim = 2;
+ break;
+
+ case TOP_TRIANGLE_LIST:
+ m_numVertsPerPrim = 3;
+ break;
+
+ default:
+ // any other topology asserts; m_numVertsPerPrim keeps its in-class default of 0
+ SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
+ break;
+ }
+ }
+
+ // True while primitives remain to be assembled.
+ bool HasWork()
+ {
+ return m_numPrims != 0;
+ }
+
+ // Not supported by this assembler: asserts and returns a dummy vector.
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+ {
+ SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
+ static simdvector junk = { 0 };
+ return junk;
+ }
+
+ // Builds a lane mask whose first numPrims lanes are all ones and the rest zero.
+ // The table holds SIMD-width ones followed by SIMD-width zeros; loading a full vector
+ // starting numPrims entries before the zeros yields exactly that pattern.
+ static simdscalari GenPrimMask(uint32_t numPrims)
+ {
+ SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
+#if KNOB_SIMD_WIDTH == 8
+ static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
+ {
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0
+ };
+#elif KNOB_SIMD_WIDTH == 16
+ static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
+ {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+#else
+#error "Help, help, I can't get up!"
+#endif
+
+ return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
+ }
+
+ // Assembles attribute 'slot' for the next batch of up to KNOB_SIMD_WIDTH primitives.
+ // verts[i] receives the four components of vertex i of every primitive in the batch;
+ // lanes beyond the number of remaining primitives are filled with zero.
+ // Returns false when no primitives remain.
+ bool Assemble(uint32_t slot, simdvector verts[])
+ {
+ static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
+ SWR_ASSERT(slot < m_numAttributes);
+
+ uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
+ if (0 == numPrimsToAssemble)
+ {
+ return false;
+ }
+
+ simdscalari mask = GenPrimMask(numPrimsToAssemble);
+
+ // start of the requested attribute (4 components per attribute)
+ const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
+ {
+ // NOTE(review): a full SIMD vector of indices is loaded even for a partial batch and the
+ // load looks like the aligned variant - confirm the index buffers are padded/aligned.
+ simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
+
+ const float* pBase = pBaseAttrib;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ // masked gather: inactive lanes take the zero source operand
+ verts[i].v[c] = _simd_mask_i32gather_ps(
+ _simd_setzero_ps(),
+ pBase,
+ indices,
+ _simd_castsi_ps(mask),
+ 4 /* gcc doesn't like sizeof(float) */);
+ // step to the next component of the attribute
+ pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
+ }
+ }
+
+ return true;
+ }
+
+ // Scalar counterpart of Assemble: extracts attribute 'slot' of primitive 'primIndex' of
+ // the current batch, writing one 4-component vertex per entry of 'verts'.
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ {
+ SWR_ASSERT(slot < m_numAttributes);
+ SWR_ASSERT(primIndex < PA_TESS::NumPrims());
+
+ const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
+ {
+ uint32_t index = m_ppIndices[i][primIndex];
+ const float* pVertData = pVertDataBase;
+ float* pVert = (float*)&verts[i];
+
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ pVert[c] = pVertData[index];
+ // step to the next component of the attribute
+ pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
+ }
+ }
+ }
+
+ // Consumes the batch just assembled: reduces the remaining primitive count and advances
+ // all three index lists past it. Returns whether primitives remain.
+ bool NextPrim()
+ {
+ uint32_t numPrims = PA_TESS::NumPrims();
+ m_numPrims -= numPrims;
+ m_ppIndices[0] += numPrims;
+ m_ppIndices[1] += numPrims;
+ m_ppIndices[2] += numPrims;
+
+ return HasWork();
+ }
+
+ // Not supported by this assembler: asserts and returns a dummy vertex.
+ simdvertex& GetNextVsOutput()
+ {
+ SWR_ASSERT(0, "%s", __FUNCTION__);
+ static simdvertex junk;
+ return junk;
+ }
+
+ // Not supported by this assembler: asserts and returns false.
+ bool GetNextStreamOutput()
+ {
+ SWR_ASSERT(0, "%s", __FUNCTION__);
+ return false;
+ }
+
+ // Not supported by this assembler: asserts and returns a dummy mask.
+ simdmask& GetNextVsIndices()
+ {
+ SWR_ASSERT(0, "%s", __FUNCTION__);
+ static simdmask junk;
+ return junk;
+ }
+
+ // Size of the current batch: the remaining primitives, capped at the SIMD width.
+ uint32_t NumPrims()
+ {
+ return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
+ }
+
+ // Not supported by this assembler: asserts.
+ void Reset() { SWR_ASSERT(0); };
+
+ // Per-lane primitive IDs: startID plus m_vPrimId, which this struct only ever sets to
+ // zero, so every lane receives startID.
+ simdscalari GetPrimID(uint32_t startID)
+ {
+ return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
+ }
+
+private:
+ const simdscalar* m_pVertexData = nullptr; // vertex data the indices refer to
+ uint32_t m_attributeStrideInVectors = 0; // component-to-component stride, in SIMD vectors
+ uint32_t m_numAttributes = 0; // attribute count, used to validate 'slot'
+ uint32_t m_numPrims = 0; // primitives still to be assembled
+ uint32_t* m_ppIndices[3]; // cursor into each per-vertex index list
+
+ uint32_t m_numVertsPerPrim = 0; // 1, 2 or 3 depending on binTopology
+
+ simdscalari m_vPrimId; // per-lane primitive ID offsets (zero)
+};
+
+// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
+// based on state.
+// Both assemblers live inside the factory together with the vertex and cut-mask storage they
+// operate on; the constructor initializes exactly one of them in place and GetPA() returns it.
+template <bool IsIndexedT>
+struct PA_FACTORY
+{
+ // pDC      - draw context the assembler is created for
+ // in_topo  - topology used to pick the assembler
+ // numVerts - number of vertices in the draw
+ PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
+ {
+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
+     const API_STATE& state = GetApiState(pDC);
+     if ((IsIndexedT && (
+         topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
+         topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
+         topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ ||
+         topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
+         topo == TOP_TRI_STRIP_ADJ)) ||
+
+         // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
+         // for them in the optimized PA
+         (!IsIndexedT && (
+         topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)))
+     {
+         memset(&indexStore, 0, sizeof(indexStore));
+
+         // Attribute count = index of the highest set bit + 1. A bit scan leaves its result
+         // undefined for a zero mask, so handle the empty mask explicitly instead of
+         // incrementing garbage.
+         DWORD numAttribs = 0;
+         if (state.feAttribMask != 0)
+         {
+             _BitScanReverse(&numAttribs, state.feAttribMask);
+             numAttribs++;
+         }
+
+         new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
+             &this->indexStore[0], numVerts, numAttribs, state.topology, false);
+         cutPA = true;
+     }
+     else
+#endif
+     {
+         uint32_t numPrims = GetNumPrims(in_topo, numVerts);
+         new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
+         cutPA = false;
+     }
+
+ }
+
+ // Returns the assembler selected by the constructor.
+ PA_STATE& GetPA()
+ {
+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
+     if (cutPA)
+     {
+         return this->paCut;
+     }
+     else
+#endif
+     {
+         return this->paOpt;
+     }
+ }
+
+ PA_STATE_OPT paOpt; // optimized assembler (valid when !cutPA)
+ PA_STATE_CUT paCut; // cut-aware assembler (valid when cutPA)
+ bool cutPA; // which of the two assemblers was initialized
+
+ PRIMITIVE_TOPOLOGY topo;
+
+ simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; // vertex storage handed to the assembler
+ simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; // cut masks handed to the cut-aware assembler
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
new file mode 100644
index 00000000000..9850b436e39
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -0,0 +1,1177 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file pa_avx.cpp
+*
+* @brief AVX implementation for primitive assembly.
+* N primitives are assembled at a time, where N is the SIMD width.
+* A state machine, that is specific for a given topology, drives the
+* assembly of vertices into triangles.
+*
+******************************************************************************/
+#include "context.h"
+#include "pa.h"
+#include "frontend.h"
+
+#if (KNOB_SIMD_WIDTH == 8)
+
+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+
+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]);
+
+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]);
+
+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+// Extracts one patch (primitive 'primIndex') of attribute 'slot' in horizontal form:
+// verts receives TotalControlPoints consecutive 4-component vectors, one per control point.
+/// @todo Optimize this
+template <uint32_t TotalControlPoints>
+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    float* pOut = (float*)verts;
+
+    // control points of a patch are consecutive in the input stream
+    uint32_t srcCp = primIndex * TotalControlPoints;
+    for (uint32_t cp = 0; cp < TotalControlPoints; ++cp, ++srcCp)
+    {
+        const uint32_t srcVec = srcCp / KNOB_SIMD_WIDTH;
+        const uint32_t srcLane = srcCp % KNOB_SIMD_WIDTH;
+
+        // copy the four components of this control point
+        for (uint32_t comp = 0; comp < 4; ++comp)
+        {
+            const float* pSrc = (const float*)(&PaGetSimdVector(pa, srcVec, slot)[comp]);
+            *pOut++ = pSrc[srcLane];
+        }
+    }
+}
+
+// Intermediate state of the patch-list state machine: no output is produced here. It only
+// selects PaPatchList<TotalControlPoints, CurrentControlPoints + 1> as the next state and
+// returns false. The <N, N> explicit specializations further down end this chain.
+template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
+static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(
+ pa,
+ PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
+ PaPatchListSingle<TotalControlPoints>);
+
+ return false;
+}
+
+// Final state of the patch-list state machine: assembles KNOB_SIMD_WIDTH patches of
+// attribute 'slot'. verts[cp] receives control point cp of every patch, one patch per lane.
+// Afterwards the state machine is rewound to the first PaPatchList state.
+/// @todo Optimize this
+template<uint32_t TotalControlPoints>
+static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    for (uint32_t comp = 0; comp < 4; ++comp)
+    {
+        for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
+        {
+            float gathered[KNOB_SIMD_WIDTH];
+
+            // lane L reads control point cp of patch L, i.e. input vertex L * Total + cp
+            uint32_t srcCp = cp;
+            for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane, srcCp += TotalControlPoints)
+            {
+                const uint32_t srcVec = srcCp / KNOB_SIMD_WIDTH;
+                const uint32_t srcLane = srcCp % KNOB_SIMD_WIDTH;
+
+                const float* pSrc = (const float*)(&PaGetSimdVector(pa, srcVec, slot)[comp]);
+                gathered[lane] = pSrc[srcLane];
+            }
+            verts[cp][comp] = _simd_loadu_ps(gathered);
+        }
+    }
+
+    SetNextPaState(
+        pa,
+        PaPatchList<TotalControlPoints>,
+        PaPatchListSingle<TotalControlPoints>,
+        0,
+        KNOB_SIMD_WIDTH,
+        true);
+
+    return true;
+}
+
+// Explicit specializations PaPatchList<N, N> for patch sizes 1 through 32: once the current
+// control point count reaches N, the state forwards to PaPatchListTerm<N> instead of
+// selecting yet another intermediate state.
+#define PA_PATCH_LIST_TERMINATOR(N) \
+ template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
+ { return PaPatchListTerm<N>(pa, slot, verts); }
+PA_PATCH_LIST_TERMINATOR(1)
+PA_PATCH_LIST_TERMINATOR(2)
+PA_PATCH_LIST_TERMINATOR(3)
+PA_PATCH_LIST_TERMINATOR(4)
+PA_PATCH_LIST_TERMINATOR(5)
+PA_PATCH_LIST_TERMINATOR(6)
+PA_PATCH_LIST_TERMINATOR(7)
+PA_PATCH_LIST_TERMINATOR(8)
+PA_PATCH_LIST_TERMINATOR(9)
+PA_PATCH_LIST_TERMINATOR(10)
+PA_PATCH_LIST_TERMINATOR(11)
+PA_PATCH_LIST_TERMINATOR(12)
+PA_PATCH_LIST_TERMINATOR(13)
+PA_PATCH_LIST_TERMINATOR(14)
+PA_PATCH_LIST_TERMINATOR(15)
+PA_PATCH_LIST_TERMINATOR(16)
+PA_PATCH_LIST_TERMINATOR(17)
+PA_PATCH_LIST_TERMINATOR(18)
+PA_PATCH_LIST_TERMINATOR(19)
+PA_PATCH_LIST_TERMINATOR(20)
+PA_PATCH_LIST_TERMINATOR(21)
+PA_PATCH_LIST_TERMINATOR(22)
+PA_PATCH_LIST_TERMINATOR(23)
+PA_PATCH_LIST_TERMINATOR(24)
+PA_PATCH_LIST_TERMINATOR(25)
+PA_PATCH_LIST_TERMINATOR(26)
+PA_PATCH_LIST_TERMINATOR(27)
+PA_PATCH_LIST_TERMINATOR(28)
+PA_PATCH_LIST_TERMINATOR(29)
+PA_PATCH_LIST_TERMINATOR(30)
+PA_PATCH_LIST_TERMINATOR(31)
+PA_PATCH_LIST_TERMINATOR(32)
+#undef PA_PATCH_LIST_TERMINATOR
+
+// Triangle list, state 0 of 3: produces no output, only selects PaTriList1 as the next state.
+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaTriList1, PaTriListSingle0);
+ return false; // Not enough vertices to assemble 4 or 8 triangles.
+}
+
+// Triangle list, state 1 of 3: produces no output, only selects PaTriList2 as the next state.
+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaTriList2, PaTriListSingle0);
+ return false; // Not enough vertices to assemble 8 triangles.
+}
+
+// Triangle list, final state: de-interleaves the three SIMD vectors a, b, c (24 vertices of
+// attribute 'slot') into verts[0..2], where verts[k] holds vertex k of each of the 8
+// triangles. Rewinds the state machine to PaTriList0 and returns true.
+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+ simdvector& c = PaGetSimdVector(pa, 2, slot);
+ simdscalar s;
+
+ // Tri Pattern - provoking vertex is always v0
+ // v0 -> 0 3 6 9 12 15 18 21
+ // v1 -> 1 4 7 10 13 16 19 22
+ // v2 -> 2 5 8 11 14 17 20 23
+
+ // Same recipe for each of the 4 components and each output vertex:
+ //  1. two blends pick the wanted lanes out of a, b and c (still in scrambled order),
+ //  2. an in-lane permute reorders them within each 128-bit half,
+ //  3. a half swap plus a final blend moves the elements that belong in the other half.
+ for(int i = 0; i < 4; ++i)
+ {
+ simdvector& v0 = verts[0];
+ v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
+ v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
+ v0[i] = _mm256_permute_ps(v0[i], 0x6C);
+ s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
+ v0[i] = _simd_blend_ps(v0[i], s, 0x44);
+
+ simdvector& v1 = verts[1];
+ v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
+ v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
+ v1[i] = _mm256_permute_ps(v1[i], 0xB1);
+ s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
+ v1[i] = _simd_blend_ps(v1[i], s, 0x66);
+
+ simdvector& v2 = verts[2];
+ v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
+ v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
+ v2[i] = _mm256_permute_ps(v2[i], 0xC6);
+ s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
+ v2[i] = _simd_blend_ps(v2[i], s, 0x22);
+ }
+
+ SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+// Triangle list, single-primitive path: extracts triangle 'primIndex' (0..7) of attribute
+// 'slot' in horizontal form. Triangle n uses input vertices 3n, 3n+1 and 3n+2, which are
+// spread across the three SIMD vectors a (0-7), b (8-15) and c (16-23).
+// A primIndex outside 0..7 leaves verts untouched.
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ // We have 12 simdscalars contained within 3 simdvectors which
+ // hold at least 8 triangles worth of data. We want to assemble a single
+ // triangle with data in horizontal form.
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+ simdvector& c = PaGetSimdVector(pa, 2, slot);
+
+ // Convert from vertical to horizontal.
+ // Tri Pattern - provoking vertex is always v0
+ // v0 -> 0 3 6 9 12 15 18 21
+ // v1 -> 1 4 7 10 13 16 19 22
+ // v2 -> 2 5 8 11 14 17 20 23
+ switch(primIndex)
+ {
+ case 0:
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane1(a);
+ verts[2] = swizzleLane2(a);
+ break;
+ case 1:
+ verts[0] = swizzleLane3(a);
+ verts[1] = swizzleLane4(a);
+ verts[2] = swizzleLane5(a);
+ break;
+ case 2:
+ verts[0] = swizzleLane6(a);
+ verts[1] = swizzleLane7(a);
+ verts[2] = swizzleLane0(b);
+ break;
+ case 3:
+ verts[0] = swizzleLane1(b);
+ verts[1] = swizzleLane2(b);
+ verts[2] = swizzleLane3(b);
+ break;
+ case 4:
+ verts[0] = swizzleLane4(b);
+ verts[1] = swizzleLane5(b);
+ verts[2] = swizzleLane6(b);
+ break;
+ case 5:
+ verts[0] = swizzleLane7(b);
+ verts[1] = swizzleLane0(c);
+ verts[2] = swizzleLane1(c);
+ break;
+ case 6:
+ verts[0] = swizzleLane2(c);
+ verts[1] = swizzleLane3(c);
+ verts[2] = swizzleLane4(c);
+ break;
+ case 7:
+ verts[0] = swizzleLane5(c);
+ verts[1] = swizzleLane6(c);
+ verts[2] = swizzleLane7(c);
+ break;
+ };
+}
+
+// Triangle strip, initial state: produces no output, only selects PaTriStrip1 as the next state.
+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
+ return false; // Not enough vertices to assemble 8 triangles.
+}
+
+// Triangle strip, steady state: builds 8 triangles of attribute 'slot' from the previous
+// SIMD vector (a, vertices 0-7) and the current one (b, vertices 8-15). Odd triangles take
+// their second and third vertex in swapped order, as the pattern below shows.
+// Selects itself as the next state and returns true.
+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
+ simdscalar s;
+
+ for(int i = 0; i < 4; ++i)
+ {
+ simdscalar a0 = a[i];
+ simdscalar b0 = b[i];
+
+ // Tri Pattern - provoking vertex is always v0
+ // v0 -> 01234567
+ // v1 -> 13355779
+ // v2 -> 22446688
+ simdvector& v0 = verts[0];
+ v0[i] = a0;
+
+ // upper half of a followed by lower half of b
+ // s -> 4567891011
+ s = _mm256_permute2f128_ps(a0, b0, 0x21);
+ // a shifted left by two vertices
+ // s -> 23456789
+ s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
+
+ simdvector& v1 = verts[1];
+ // v1 -> 13355779
+ v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
+
+ simdvector& v2 = verts[2];
+ // v2 -> 22446688
+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
+ }
+
+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one triangle of a TRIANGLE_STRIP in horizontal form.
+///        Numbering the lanes of 'prev' 0-7 and those of 'cur' 8-15:
+///            v0 -> 0 1 2 3 4 5 6 7
+///            v1 -> 1 3 3 5 5 7 7 9
+///            v2 -> 2 2 4 4 6 6 8 8
+///        i.e. even triangles use (n, n+1, n+2) and odd triangles use
+///        (n, n+2, n+1). Nothing is written when primIndex > 7.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param primIndex - Which of the 8 triangles to extract.
+/// @param verts - Receives the three vertices of the triangle.
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    if (primIndex > 7)
+    {
+        return;
+    }
+
+    // Fetch strip vertex 'idx' (0-15): lanes 0-7 live in prev, 8-15 in cur.
+    auto stripVertex = [&](uint32_t idx) -> __m128
+    {
+        return (idx < 8) ? swizzleLaneN(prevVerts, idx) : swizzleLaneN(curVerts, idx - 8);
+    };
+
+    // Odd triangles swap their 2nd and 3rd vertex.
+    const uint32_t isOdd = primIndex & 1;
+
+    verts[0] = stripVertex(primIndex);
+    verts[1] = stripVertex(primIndex + 1 + isOdd);
+    verts[2] = stripVertex(primIndex + 2 - isOdd);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for TRIANGLE_FAN topology.
+///        Broadcasts lane 0 of the current SIMD vertex vector into every
+///        lane of verts[0], and stores the whole current simdvertex in
+///        pa.leadingVertex for use by later states. verts[0] is written
+///        even though the function reports that no primitives are ready.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - verts[0] is written.
+/// @return Always false.
+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+    simdvector& anchor = verts[0];
+
+    // Extract vertex 0 to every lane of first vector
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        const __m256 src = curVerts[comp];
+
+        // splat element 0 inside each 128-bit half...
+        anchor[comp] = _simd_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
+        // ...then copy the low half into both halves
+        anchor[comp] = _mm256_permute2f128_ps(anchor[comp], src, 0x00);
+    }
+
+    // store off leading vertex for attributes
+    simdvertex* pStreamVerts = (simdvertex*)pa.pStreamBase;
+    pa.leadingVertex = pStreamVerts[pa.cur];
+
+    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
+
+    // Not enough vertices to assemble 8 triangles.
+    return false;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for TRIANGLE_FAN topology.
+///        Numbering the lanes of 'prev' 0-7 and those of 'cur' 8-15, the
+///        result for each of the 4 components is:
+///            verts[0] = lane 0 of pa.leadingVertex in every lane
+///            verts[1] = { 1, 2, 3, 4, 5, 6, 7, 8 }
+///            verts[2] = { 2, 3, 4, 5, 6, 7, 8, 9 }
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - triangle output, SOA: verts[0..2] are written.
+/// @return Always true.
+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    simdvector& anchorVert = pa.leadingVertex.attrib[slot];
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    simdvector& fanV0 = verts[0];
+    simdvector& fanV1 = verts[1];
+    simdvector& fanV2 = verts[2];
+
+    // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        const simdscalar lo = prevVerts[comp];
+        const simdscalar hi = curVerts[comp];
+
+        // broadcast lane 0 of the anchor to all 8 lanes
+        const __m256 anchorComp = anchorVert[comp];
+        fanV0[comp] = _simd_shuffle_ps(anchorComp, anchorComp, _MM_SHUFFLE(0, 0, 0, 0));
+        fanV0[comp] = _mm256_permute2f128_ps(fanV0[comp], anchorComp, 0x00);
+
+        // 4 5 6 7 | 8 9 10 11
+        const simdscalar straddle = _mm256_permute2f128_ps(lo, hi, 0x21);
+        // 2 3 4 5 | 6 7 8 9
+        fanV2[comp] = _simd_shuffle_ps(lo, straddle, _MM_SHUFFLE(1, 0, 3, 2));
+
+        // 1 2 3 4 | 5 6 7 8
+        fanV1[comp] = _simd_shuffle_ps(lo, fanV2[comp], _MM_SHUFFLE(2, 1, 2, 1));
+    }
+
+    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one triangle of a TRIANGLE_FAN in horizontal form.
+///        Vertex 0 is always lane 0 of pa.leadingVertex; vertices 1 and 2
+///        are strip positions primIndex + 1 and primIndex + 2, where
+///        positions 0-7 are lanes of 'prev' and 8 onwards lanes of 'cur'.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param primIndex - Which triangle to extract.
+/// @param verts - Receives the three vertices of the triangle.
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    // vert 0 from leading vertex
+    simdvector& anchorVert = pa.leadingVertex.attrib[slot];
+    verts[0] = swizzleLane0(anchorVert);
+
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    // vert 1: still inside 'prev' for the first 7 triangles
+    verts[1] = (primIndex < 7) ? swizzleLaneN(prevVerts, primIndex + 1)
+                               : swizzleLane0(curVerts);
+
+    // vert 2: crosses over into 'cur' one triangle earlier
+    verts[2] = (primIndex < 6) ? swizzleLaneN(prevVerts, primIndex + 2)
+                               : swizzleLaneN(curVerts, primIndex - 6);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for QUAD_LIST topology.
+///        No primitives are produced here; the function only hands
+///        PaQuadList1 and PaQuadListSingle0 to SetNextPaState.
+/// @param pa - State for PA state machine.
+/// @param slot - Not read in this state.
+/// @param verts - Not written in this state.
+/// @return Always false.
+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    // Not enough vertices to assemble 8 triangles.
+    const bool primsAssembled = false;
+
+    SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for QUAD_LIST topology.
+///        Splits 4 quads (16 vertices read from stream slots 0 and 1) into
+///        8 triangles. Numbering slot 0 lanes 0-7 and slot 1 lanes 8-15:
+///            verts[0] = { 0, 0, 4, 4,  8,  8, 12, 12 }
+///            verts[1] = { 1, 2, 5, 6,  9, 10, 13, 14 }
+///            verts[2] = { 2, 3, 6, 7, 10, 11, 14, 15 }
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - triangle output, SOA: verts[0..2] are written.
+/// @return Always true.
+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    simdvector& vec0 = PaGetSimdVector(pa, 0, slot);
+    simdvector& vec1 = PaGetSimdVector(pa, 1, slot);
+
+    simdvector& tri0 = verts[0];
+    simdvector& tri1 = verts[1];
+    simdvector& tri2 = verts[2];
+
+    for (int comp = 0; comp < 4; ++comp)
+    {
+        const simdscalar first8 = vec0[comp];
+        const simdscalar second8 = vec1[comp];
+
+        // 0 1 2 3 | 8 9 10 11
+        const simdscalar lowHalves = _mm256_permute2f128_ps(first8, second8, 0x20);
+        // 4 5 6 7 | 12 13 14 15
+        const simdscalar highHalves = _mm256_permute2f128_ps(first8, second8, 0x31);
+
+        tri0[comp] = _simd_shuffle_ps(lowHalves, highHalves, _MM_SHUFFLE(0, 0, 0, 0));
+        tri1[comp] = _simd_shuffle_ps(lowHalves, highHalves, _MM_SHUFFLE(2, 1, 2, 1));
+        tri2[comp] = _simd_shuffle_ps(lowHalves, highHalves, _MM_SHUFFLE(3, 2, 3, 2));
+    }
+
+    SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one triangle of a QUAD_LIST in horizontal form.
+///        Triangles 0-3 come from stream slot 0, triangles 4-7 from slot 1.
+///        Within a slot the lanes used are:
+///            triangle 0 / 4 -> 0 1 2
+///            triangle 1 / 5 -> 0 2 3
+///            triangle 2 / 6 -> 4 5 6
+///            triangle 3 / 7 -> 4 6 7
+///        Nothing is written when primIndex > 7.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param primIndex - Which of the 8 triangles to extract.
+/// @param verts - Receives the three vertices of the triangle.
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    simdvector& vec0 = PaGetSimdVector(pa, 0, slot);
+    simdvector& vec1 = PaGetSimdVector(pa, 1, slot);
+
+    if (primIndex > 7)
+    {
+        return;
+    }
+
+    simdvector& src = (primIndex < 4) ? vec0 : vec1;
+
+    // Each quad owns 4 consecutive lanes; bit 1 of primIndex picks the quad.
+    const uint32_t quadBase = ((primIndex >> 1) & 1) * 4;
+    // The second triangle of a quad skips the quad's vertex 1.
+    const uint32_t secondTri = primIndex & 1;
+
+    verts[0] = swizzleLaneN(src, quadBase);
+    verts[1] = swizzleLaneN(src, quadBase + 1 + secondTri);
+    verts[2] = swizzleLaneN(src, quadBase + 2 + secondTri);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one line of a LINE_LOOP in horizontal form.
+///        Behaves like PaLineStripSingle0, except that the end point of
+///        the very last line of the draw is replaced by lane 0 of the
+///        vector at pa.first.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param lineIndex - Which line of the current SIMD batch to extract.
+/// @param verts - Receives the two vertices of the line.
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+    PaLineStripSingle0(pa, slot, lineIndex, verts);
+
+    const bool isClosingLine = (pa.numPrimsComplete + lineIndex == pa.numPrims - 1);
+    if (!isClosingLine)
+    {
+        return;
+    }
+
+    // reconnect to the start of the loop
+    simdvector& firstVerts = PaGetSimdVector(pa, pa.first, slot);
+    verts[1] = swizzleLane0(firstVerts);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for LINE_LOOP topology.
+///        No primitives are produced here; the function only hands
+///        PaLineLoop1 and PaLineLoopSingle0 to SetNextPaState.
+/// @param pa - State for PA state machine.
+/// @param slot - Not read in this state.
+/// @param verts - Not written in this state.
+/// @return Always false.
+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    const bool primsAssembled = false;
+
+    SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for LINE_LOOP topology.
+///        Assembles 8 lines exactly like PaLineStrip1. If the last line of
+///        the draw falls inside this batch, the end point in that lane of
+///        verts[1] is overwritten with lane 0 of the vector at pa.first.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - line output, SOA: verts[0..1] are written.
+/// @return Always true.
+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    PaLineStrip1(pa, slot, verts);
+
+    const bool closesLoop = (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1);
+    if (closesLoop)
+    {
+        // loop reconnect now
+        const int closingLane = pa.numPrims - pa.numPrimsComplete - 1;
+        simdvector& firstVerts = PaGetSimdVector(pa, pa.first, slot);
+        simdvector& lineEnds = verts[1];
+
+        for (int comp = 0; comp < 4; comp++)
+        {
+            // patch a single float lane through scalar pointers
+            float* pSrc = (float *)&(firstVerts[comp]);
+            float* pDst = (float *)&(lineEnds[comp]);
+            pDst[closingLane] = pSrc[0];
+        }
+    }
+
+    // PaLineStrip1 has already called SetNextPaState; this call comes later
+    // and names the line-loop routines instead.
+    SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
+    return true;
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for LINE_LIST topology.
+///        No primitives are produced here; the function only hands
+///        PaLineList1 and PaLineListSingle0 to SetNextPaState.
+/// @param pa - State for PA state machine.
+/// @param slot - Not read in this state.
+/// @param verts - Not written in this state.
+/// @return Always false.
+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    // Not enough vertices to assemble 8 lines
+    const bool primsAssembled = false;
+
+    SetNextPaState(pa, PaLineList1, PaLineListSingle0);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for LINE_LIST topology.
+///        De-interleaves 16 vertices (stream slots 0 and 1) into 8 lines.
+///        Numbering slot 0 lanes 0-7 and slot 1 lanes 8-15:
+///            verts[0] = { 0, 2, 4, 6,  8, 10, 12, 14 }
+///            verts[1] = { 1, 3, 5, 7,  9, 11, 13, 15 }
+/// @todo verify provoking vertex is correct
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - line output, SOA: verts[0..1] are written.
+/// @return Always true.
+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    simdvector& vec0 = PaGetSimdVector(pa, 0, slot);
+    simdvector& vec1 = PaGetSimdVector(pa, 1, slot);
+
+    simdvector& lineStarts = verts[0];
+    simdvector& lineEnds = verts[1];
+
+    for (uint32_t comp = 0; comp < 4; ++comp)
+    {
+        // 0 1 2 3 | 8 9 10 11
+        const __m256 lowHalves = _mm256_permute2f128_ps(vec0.v[comp], vec1.v[comp], 0x20);
+        // 4 5 6 7 | 12 13 14 15
+        const __m256 highHalves = _mm256_permute2f128_ps(vec0.v[comp], vec1.v[comp], 0x31);
+
+        // even vertices: 0 2 4 6 | 8 10 12 14
+        lineStarts.v[comp] = _mm256_shuffle_ps(lowHalves, highHalves, _MM_SHUFFLE(2, 0, 2, 0));
+        // odd vertices: 1 3 5 7 | 9 11 13 15
+        lineEnds.v[comp] = _mm256_shuffle_ps(lowHalves, highHalves, _MM_SHUFFLE(3, 1, 3, 1));
+    }
+
+    SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one line of a LINE_LIST in horizontal form.
+///        Lines 0-3 use lane pairs (0,1) (2,3) (4,5) (6,7) of the 'prev'
+///        vector, lines 4-7 use the same lane pairs of the 'cur' vector.
+///        Nothing is written when primIndex > 7.
+///        NOTE(review): this reads pa.prev / pa.cur while PaLineList1 reads
+///        stream slots 0 / 1 - confirm the two always agree.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param primIndex - Which of the 8 lines to extract.
+/// @param verts - Receives the two vertices of the line.
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    if (primIndex > 7)
+    {
+        return;
+    }
+
+    simdvector& src = (primIndex < 4) ? prevVerts : curVerts;
+
+    // each line consumes two adjacent lanes; 4 lines per vector
+    const uint32_t startLane = (primIndex & 3) * 2;
+
+    verts[0] = swizzleLaneN(src, startLane);
+    verts[1] = swizzleLaneN(src, startLane + 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for LINE_STRIP topology.
+///        No primitives are produced here; the function only hands
+///        PaLineStrip1 and PaLineStripSingle0 to SetNextPaState.
+/// @param pa - State for PA state machine.
+/// @param slot - Not read in this state.
+/// @param verts - Not written in this state.
+/// @return Always false.
+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    // Not enough vertices to assemble 8 lines
+    const bool primsAssembled = false;
+
+    SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for LINE_STRIP topology.
+///        Builds 8 lines from the 'prev' and 'cur' SIMD vertex vectors.
+///        Numbering the lanes of 'prev' 0-7 and those of 'cur' 8-15:
+///            verts[0] = { 0, 1, 2, 3, 4, 5, 6, 7 }
+///            verts[1] = { 1, 2, 3, 4, 5, 6, 7, 8 }
+/// @todo verify provoking vertex is correct
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - line output, SOA: verts[0..1] are written.
+/// @return Always true.
+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    // line start points are simply the previous vector
+    verts[0] = prevVerts;
+
+    for (uint32_t comp = 0; comp < 4; ++comp)
+    {
+        // rotate each 128-bit half left by one element: 1 2 3 x | 5 6 7 x
+        // (0x39 selects elements 1, 2, 3, 0)
+        const __m256 rotated = _mm256_permute_ps(prevVerts.v[comp], 0x39);
+
+        // 4 5 6 7 | 8 9 10 11
+        const __m256 straddle = _mm256_permute2f128_ps(prevVerts.v[comp], curVerts.v[comp], 0x21);
+
+        // broadcast element 0 of each half: 4 4 4 4 | 8 8 8 8
+        const __m256 carryIn = _mm256_permute_ps(straddle, 0);
+
+        // take element 3 of each half from carryIn: 1 2 3 4 | 5 6 7 8
+        verts[1].v[comp] = _mm256_blend_ps(rotated, carryIn, 0x88);
+    }
+
+    SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one line of a LINE_STRIP in horizontal form.
+///        Line n runs from lane n to lane n + 1 of the 'prev' vector; the
+///        last line (n == 7) ends on lane 0 of the 'cur' vector.
+///        Nothing is written when lineIndex > 7.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param lineIndex - Which of the 8 lines to extract.
+/// @param verts - Receives the two vertices of the line.
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+    simdvector& prevVerts = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    if (lineIndex > 7)
+    {
+        return;
+    }
+
+    verts[0] = swizzleLaneN(prevVerts, lineIndex);
+    verts[1] = (lineIndex < 7) ? swizzleLaneN(prevVerts, lineIndex + 1)
+                               : swizzleLane0(curVerts);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief The only state for POINT_LIST topology.
+///        Copies the current SIMD vertex vector to verts[0] unchanged.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param verts - point output: verts[0] is written.
+/// @return Always true.
+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    // points only have 1 vertex.
+    verts[0] = PaGetSimdVector(pa, pa.cur, slot);
+
+    SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Extracts one point of a POINT_LIST in horizontal form.
+///        Point n is lane n of the current vector. Nothing is written
+///        when primIndex > 7.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output.
+/// @param primIndex - Which of the 8 points to extract.
+/// @param verts - Receives the single vertex of the point.
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+    simdvector& curVerts = PaGetSimdVector(pa, pa.cur, slot);
+
+    if (primIndex < 8)
+    {
+        verts[0] = swizzleLaneN(curVerts, primIndex);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for RECT_LIST topology.
+///        There is not enough to assemble 8 triangles, so this state only
+///        hands PaRectList1 and PaRectListSingle0 to SetNextPaState.
+/// @param pa - State for PA state machine.
+/// @param slot - Not read in this state.
+/// @param verts - Not written in this state.
+/// @return Always false.
+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+    const bool primsAssembled = false;
+
+    SetNextPaState(pa, PaRectList1, PaRectListSingle0);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for RECT_LIST topology.
+///        Rect lists have the following format.
+///             w          x          y           z
+///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
+///         | \ |      | \ |      | \ |       | \ |
+///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
+///            v0         v3         v6          v9
+///
+///        Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
+///
+///        tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
+///        tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
+///        etc.
+///
+///        PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
+///        where v0 contains all the first vertices for 8 triangles.
+///
+///        Result:
+///          verts[0] = { v0, v0, v3, v3, v6, v6, v9,  v9  }
+///          verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
+///          verts[2] = { v2, w,  v5, x,  v8, y,  v11, z   }
+///
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
+/// @return Always true.
+bool PaRectList1(
+    PA_STATE_OPT& pa,
+    uint32_t slot,
+    simdvector verts[])
+{
+    // SIMD vectors a and b are read from stream slots 0 and 1.
+    simdvector& a = PaGetSimdVector(pa, 0, slot);   // a[] = { v0, v1, v2,  v3,  v4,  v5,  v6,  v7 }
+    simdvector& b = PaGetSimdVector(pa, 1, slot);   // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
+
+    __m256 tmp0, tmp1, tmp2;
+
+    // Loop over each component in the simdvector.
+    for (int i = 0; i < 4; ++i)
+    {
+        simdvector& v0 = verts[0];                          // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+        tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
+        v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
+        tmp1 = _mm256_permute_ps(v0[i], 0xF0);              // tmp1 = { v0, v0, v3, v3, *, *, *, * }
+        v0[i] = _mm256_permute_ps(v0[i], 0x5A);             //   v0 = { *, *, *, *, v6, v6, v9, v9 }
+        v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0);         //   v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
+
+        /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
+        ///      AVX2 should make this much cheaper.
+        simdvector& v1 = verts[1];                          // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+        v1[i] = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2, *, *, *, *, *, * }
+        tmp1 = _mm256_permute_ps(a[i], 0x43);               // tmp1 = { *, *, *, *, v7, *, v4, v5 }
+        tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0);          // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
+        tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);     // tmp1 = { v7, *, v4, v5, *, *, *, * }
+        v1[i] = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = { *, *, *, *, *, v8, v10, v11 }
+        v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0);         //   v1 = { v1, v2, *, *, v7, v8, v10, v11 }
+        v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+
+        // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }.
+        // The finished verts[1] holds the 2nd and 3rd vertex of every rect in
+        // adjacent lanes, so selecting elements (1, 0, 3, 0) of each 128-bit
+        // half yields every 3rd vertex in the even lanes. Selecting from tmp2
+        // instead would be wrong: its low half is { v1, v2, v0, v0 } and
+        // therefore contains no v5 for lane 2.
+        simdvector& v2 = verts[2];
+        v2[i] = _mm256_permute_ps(v1[i], 0x31);             //   v2 = { v2, *, v5, *, v8, *, v11, * }
+
+        // Need to compute 4th implied vertex for the rectangle.
+        tmp2 = _mm256_sub_ps(v0[i], v1[i]);
+        tmp2 = _mm256_add_ps(tmp2, v2[i]);                  // tmp2 = { w, *, x, *, y, *, z, * }
+        tmp2 = _mm256_permute_ps(tmp2, 0xA0);               // tmp2 = { *, w, *, x, *, y, *, z }
+        v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA);         //   v2 = { v2, w, v5, x, v8, y, v11, z }
+    }
+
+    SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 2 for RECT_LIST topology.
+///        Not implemented unless there is a use case for more than 8 rects;
+///        the body asserts unconditionally and writes nothing to verts.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. Not written by this state.
+/// @return Always true.
+bool PaRectList2(
+    PA_STATE_OPT& pa,
+    uint32_t slot,
+    simdvector verts[])
+{
+    // Is rect list used for anything other than clears?
+    SWR_ASSERT(0);
+
+    const bool primsAssembled = true;
+    SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+    return primsAssembled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief This procedure is called by the Binner to assemble the attributes.
+///        Unlike position, which is stored vertically, the attributes are
+///        stored horizontally, so the lanes holding the vertical data have
+///        to be transposed into horizontal form. Only the two triangles of
+///        the first rect (primIndex 0 and 1) are supported; primIndex 2-7
+///        assert, and larger values do nothing.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output for a given attribute.
+/// @param primIndex - Binner processes each triangle individually.
+/// @param verts - Receives the three vertices of the requested triangle.
+void PaRectListSingle0(
+    PA_STATE_OPT& pa,
+    uint32_t slot,
+    uint32_t primIndex,
+    __m128 verts[])
+{
+    // Only stream slot 0 is read here.
+    simdvector& rectVerts = PaGetSimdVector(pa, 0, slot);
+
+    // Convert from vertical to horizontal.
+    if (primIndex == 0)
+    {
+        // tri0 = { v0, v1, v2 }
+        verts[0] = swizzleLane0(rectVerts);
+        verts[1] = swizzleLane1(rectVerts);
+        verts[2] = swizzleLane2(rectVerts);
+    }
+    else if (primIndex == 1)
+    {
+        // tri1 = { v0, v2, implied 4th vertex }
+        verts[0] = swizzleLane0(rectVerts);
+        verts[1] = swizzleLane2(rectVerts);
+        // 4th vertex: element 1 taken from v2, elements 0, 2 and 3 from v0
+        verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2);
+    }
+    else if (primIndex <= 7)
+    {
+        SWR_ASSERT(0);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up the optimized PA state machine for one draw.
+///        Picks the state-0 assembly function for the topology, scales the
+///        primitive count for topologies that are decomposed into two
+///        triangles each, and seeds the per-lane primitive IDs.
+/// @param in_pDC - Draw context; its API state supplies the topology when
+///                 'topo' is TOP_UNKNOWN.
+/// @param in_numPrims - Number of input primitives.
+/// @param pStream - Vertex stream, forwarded to the PA_STATE base.
+/// @param in_streamSizeInVerts - Stream size, forwarded to the PA_STATE base.
+/// @param in_isStreaming - Stored in isStreaming.
+/// @param topo - Topology override, or TOP_UNKNOWN to use the API state.
+PA_STATE_OPT::PA_STATE_OPT(
+    DRAW_CONTEXT *in_pDC,
+    uint32_t in_numPrims,
+    uint8_t* pStream,
+    uint32_t in_streamSizeInVerts,
+    bool in_isStreaming,
+    PRIMITIVE_TOPOLOGY topo)
+    : PA_STATE(in_pDC, pStream, in_streamSizeInVerts),
+      numPrims(in_numPrims),
+      numPrimsComplete(0),
+      numSimdPrims(0),
+      cur(0),
+      prev(0),
+      first(0),
+      counter(0),
+      reset(false),
+      pfnPaFunc(nullptr),
+      isStreaming(in_isStreaming)
+{
+    const API_STATE& apiState = GetApiState(pDC);
+
+    binTopology = (topo == TOP_UNKNOWN) ? apiState.topology : topo;
+
+    // Select the initial state function. numPrims already equals in_numPrims
+    // from the initializer list, so only the topologies that change it are
+    // touched here.
+    switch (binTopology)
+    {
+    case TOP_TRIANGLE_LIST:  pfnPaFunc = PaTriList0;   break;
+    case TOP_TRIANGLE_STRIP: pfnPaFunc = PaTriStrip0;  break;
+    case TOP_TRIANGLE_FAN:   pfnPaFunc = PaTriFan0;    break;
+
+    case TOP_QUAD_LIST:
+        pfnPaFunc = PaQuadList0;
+        numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
+        break;
+
+    case TOP_QUAD_STRIP:
+        // quad strip pattern when decomposed into triangles is the same as tri strips
+        pfnPaFunc = PaTriStrip0;
+        numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
+        break;
+
+    case TOP_LINE_LIST:      pfnPaFunc = PaLineList0;  break;
+    case TOP_LINE_STRIP:     pfnPaFunc = PaLineStrip0; break;
+    case TOP_LINE_LOOP:      pfnPaFunc = PaLineLoop0;  break;
+
+    case TOP_POINT_LIST:
+        // use point binner and rasterizer if supported
+        pfnPaFunc = PaPoints0;
+        break;
+
+    case TOP_RECT_LIST:
+        pfnPaFunc = PaRectList0;
+        numPrims = in_numPrims * 2;    // two triangles per rect
+        break;
+
+    case TOP_PATCHLIST_1:    pfnPaFunc = PaPatchList<1>;   break;
+    case TOP_PATCHLIST_2:    pfnPaFunc = PaPatchList<2>;   break;
+    case TOP_PATCHLIST_3:    pfnPaFunc = PaPatchList<3>;   break;
+    case TOP_PATCHLIST_4:    pfnPaFunc = PaPatchList<4>;   break;
+    case TOP_PATCHLIST_5:    pfnPaFunc = PaPatchList<5>;   break;
+    case TOP_PATCHLIST_6:    pfnPaFunc = PaPatchList<6>;   break;
+    case TOP_PATCHLIST_7:    pfnPaFunc = PaPatchList<7>;   break;
+    case TOP_PATCHLIST_8:    pfnPaFunc = PaPatchList<8>;   break;
+    case TOP_PATCHLIST_9:    pfnPaFunc = PaPatchList<9>;   break;
+    case TOP_PATCHLIST_10:   pfnPaFunc = PaPatchList<10>;  break;
+    case TOP_PATCHLIST_11:   pfnPaFunc = PaPatchList<11>;  break;
+    case TOP_PATCHLIST_12:   pfnPaFunc = PaPatchList<12>;  break;
+    case TOP_PATCHLIST_13:   pfnPaFunc = PaPatchList<13>;  break;
+    case TOP_PATCHLIST_14:   pfnPaFunc = PaPatchList<14>;  break;
+    case TOP_PATCHLIST_15:   pfnPaFunc = PaPatchList<15>;  break;
+    case TOP_PATCHLIST_16:   pfnPaFunc = PaPatchList<16>;  break;
+    case TOP_PATCHLIST_17:   pfnPaFunc = PaPatchList<17>;  break;
+    case TOP_PATCHLIST_18:   pfnPaFunc = PaPatchList<18>;  break;
+    case TOP_PATCHLIST_19:   pfnPaFunc = PaPatchList<19>;  break;
+    case TOP_PATCHLIST_20:   pfnPaFunc = PaPatchList<20>;  break;
+    case TOP_PATCHLIST_21:   pfnPaFunc = PaPatchList<21>;  break;
+    case TOP_PATCHLIST_22:   pfnPaFunc = PaPatchList<22>;  break;
+    case TOP_PATCHLIST_23:   pfnPaFunc = PaPatchList<23>;  break;
+    case TOP_PATCHLIST_24:   pfnPaFunc = PaPatchList<24>;  break;
+    case TOP_PATCHLIST_25:   pfnPaFunc = PaPatchList<25>;  break;
+    case TOP_PATCHLIST_26:   pfnPaFunc = PaPatchList<26>;  break;
+    case TOP_PATCHLIST_27:   pfnPaFunc = PaPatchList<27>;  break;
+    case TOP_PATCHLIST_28:   pfnPaFunc = PaPatchList<28>;  break;
+    case TOP_PATCHLIST_29:   pfnPaFunc = PaPatchList<29>;  break;
+    case TOP_PATCHLIST_30:   pfnPaFunc = PaPatchList<30>;  break;
+    case TOP_PATCHLIST_31:   pfnPaFunc = PaPatchList<31>;  break;
+    case TOP_PATCHLIST_32:   pfnPaFunc = PaPatchList<32>;  break;
+
+    default:
+        SWR_ASSERT(0);
+        break;
+    }
+
+    pfnPaFuncReset = pfnPaFunc;
+
+    // _mm256_set_epi32 takes its arguments from the highest lane down, so
+    // lane k of onePerLane holds k and lane k of onePerLanePair holds k / 2.
+    const simdscalari onePerLane = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    const simdscalari onePerLanePair = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+
+    switch (binTopology)
+    {
+    // Topologies decomposed into two triangles per input primitive: two
+    // adjacent lanes share a primitive ID and a full SIMD covers 4 inputs.
+    case TOP_QUAD_LIST:
+    case TOP_QUAD_STRIP:
+    case TOP_RECT_LIST:
+        primIDIncr = 4;
+        primID = onePerLanePair;
+        break;
+
+    // Everything else has one primitive per lane. Patch lists always run
+    // KNOB_SIMD_WIDTH number of patches at a time.
+    case TOP_TRIANGLE_LIST:
+    case TOP_TRIANGLE_STRIP:
+    case TOP_TRIANGLE_FAN:
+    case TOP_LINE_STRIP:
+    case TOP_LINE_LIST:
+    case TOP_LINE_LOOP:
+    case TOP_POINT_LIST:
+    case TOP_PATCHLIST_1:
+    case TOP_PATCHLIST_2:
+    case TOP_PATCHLIST_3:
+    case TOP_PATCHLIST_4:
+    case TOP_PATCHLIST_5:
+    case TOP_PATCHLIST_6:
+    case TOP_PATCHLIST_7:
+    case TOP_PATCHLIST_8:
+    case TOP_PATCHLIST_9:
+    case TOP_PATCHLIST_10:
+    case TOP_PATCHLIST_11:
+    case TOP_PATCHLIST_12:
+    case TOP_PATCHLIST_13:
+    case TOP_PATCHLIST_14:
+    case TOP_PATCHLIST_15:
+    case TOP_PATCHLIST_16:
+    case TOP_PATCHLIST_17:
+    case TOP_PATCHLIST_18:
+    case TOP_PATCHLIST_19:
+    case TOP_PATCHLIST_20:
+    case TOP_PATCHLIST_21:
+    case TOP_PATCHLIST_22:
+    case TOP_PATCHLIST_23:
+    case TOP_PATCHLIST_24:
+    case TOP_PATCHLIST_25:
+    case TOP_PATCHLIST_26:
+    case TOP_PATCHLIST_27:
+    case TOP_PATCHLIST_28:
+    case TOP_PATCHLIST_29:
+    case TOP_PATCHLIST_30:
+    case TOP_PATCHLIST_31:
+    case TOP_PATCHLIST_32:
+        primIDIncr = 8;
+        primID = onePerLane;
+        break;
+
+    default:
+        // primIDIncr and primID are left unassigned for unknown topologies.
+        SWR_ASSERT(0);
+        break;
+    }
+}
+#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
new file mode 100644
index 00000000000..587e336d87d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -0,0 +1,1393 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rasterizer.cpp
+*
+* @brief Implementation for the rasterizer.
+*
+******************************************************************************/
+
+#include <vector>
+#include <algorithm>
+
+#include "rasterizer.h"
+#include "multisample.h"
+#include "rdtsc_core.h"
+#include "backend.h"
+#include "utils.h"
+#include "frontend.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+
+// Forward declarations.
+void GetRenderHotTiles(
+    DRAW_CONTEXT *pDC,
+    uint32_t macroID,
+    uint32_t x,
+    uint32_t y,
+    RenderOutputBuffers &renderBuffers,
+    uint32_t numSamples,
+    uint32_t renderTargetArrayIndex);
+
+void StepRasterTileX(
+    uint32_t MaxRT,
+    RenderOutputBuffers &buffers,
+    uint32_t colorTileStep,
+    uint32_t depthTileStep,
+    uint32_t stencilTileStep);
+
+void StepRasterTileY(
+    uint32_t MaxRT,
+    RenderOutputBuffers &buffers,
+    RenderOutputBuffers &startBufferRow,
+    uint32_t colorRowStep,
+    uint32_t depthRowStep,
+    uint32_t stencilRowStep);
+
+// Expands to a 4-element vector initializer whose element k is -(ik), i.e.
+// -1 where bit k of the mask is set and 0 where it is clear. Arguments are
+// listed from the most significant bit (i3) down to the least (i0).
+#define MASKTOVEC(i3,i2,i1,i0) {-(i0),-(i1),-(i2),-(i3)}
+
+// Lookup table from a 4-bit mask to a 4 x float vector; entry m has a
+// negative element (sign bit set) in every lane whose bit is set in m.
+const __m128 gMaskToVec[] = {
+    MASKTOVEC(0,0,0,0),    // 0x0
+    MASKTOVEC(0,0,0,1),    // 0x1
+    MASKTOVEC(0,0,1,0),    // 0x2
+    MASKTOVEC(0,0,1,1),    // 0x3
+    MASKTOVEC(0,1,0,0),    // 0x4
+    MASKTOVEC(0,1,0,1),    // 0x5
+    MASKTOVEC(0,1,1,0),    // 0x6
+    MASKTOVEC(0,1,1,1),    // 0x7
+    MASKTOVEC(1,0,0,0),    // 0x8
+    MASKTOVEC(1,0,0,1),    // 0x9
+    MASKTOVEC(1,0,1,0),    // 0xA
+    MASKTOVEC(1,0,1,1),    // 0xB
+    MASKTOVEC(1,1,0,0),    // 0xC
+    MASKTOVEC(1,1,0,1),    // 0xD
+    MASKTOVEC(1,1,1,0),    // 0xE
+    MASKTOVEC(1,1,1,1),    // 0xF
+};
+
+// Lookup table from a 4-bit mask to a 4 x double vector; entry m has a
+// negative element (sign bit set) in every lane whose bit is set in m.
+// adjustTopLeftRuleIntFix16 uses it as the selector for _mm256_blendv_pd.
+const __m256d gMaskToVecpd[] =
+{
+    MASKTOVEC(0,0,0,0),    // 0x0
+    MASKTOVEC(0,0,0,1),    // 0x1
+    MASKTOVEC(0,0,1,0),    // 0x2
+    MASKTOVEC(0,0,1,1),    // 0x3
+    MASKTOVEC(0,1,0,0),    // 0x4
+    MASKTOVEC(0,1,0,1),    // 0x5
+    MASKTOVEC(0,1,1,0),    // 0x6
+    MASKTOVEC(0,1,1,1),    // 0x7
+    MASKTOVEC(1,0,0,0),    // 0x8
+    MASKTOVEC(1,0,0,1),    // 0x9
+    MASKTOVEC(1,0,1,0),    // 0xA
+    MASKTOVEC(1,0,1,1),    // 0xB
+    MASKTOVEC(1,1,0,0),    // 0xC
+    MASKTOVEC(1,1,0,1),    // 0xD
+    MASKTOVEC(1,1,1,0),    // 0xE
+    MASKTOVEC(1,1,1,1),    // 0xF
+};
+
+// Integer 2D position.
+struct POS
+{
+    int32_t x;
+    int32_t y;
+};
+
+// Per-edge data consumed while stepping an edge equation across a raster tile.
+struct EDGE
+{
+    double a;                      // a edge coefficient in fix8
+    double b;                      // b edge coefficient in fix8
+    double stepQuadX;              // step to adjacent horizontal quad in fix16
+    double stepQuadY;              // step to adjacent vertical quad in fix16
+    double stepRasterTileX;        // step to adjacent horizontal raster tile in fix16
+    double stepRasterTileY;        // step to adjacent vertical raster tile in fix16
+
+    __m256d vQuadOffsets;          // offsets for 4 samples of a quad
+    __m256d vRasterTileOffsets;    // offsets for the 4 corners of a raster tile
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief rasterize a raster tile partially covered by the triangle
+/// @param pDC - draw context; not referenced by the body of this function
+/// @param startEdges - one starting value per edge; each edge's vQuadOffsets are added to it to reach the samples of the first 2x2 quad
+/// @param pRastEdges - per-edge data; supplies vQuadOffsets and the stepQuadX / stepQuadY increments used to move between quads
+/// @return coverage mask with 4 bits per 2x2 quad; a bit is set when every edge value has its sign bit set for that sample
+template<uint32_t NumEdges>
+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
+{
+ uint64_t coverageMask = 0;
+
+ __m256d vEdges[NumEdges];
+ __m256d vStepX[NumEdges];
+ __m256d vStepY[NumEdges];
+
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ // Step to the pixel sample locations of the 1st quad
+ vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
+
+ // broadcast the precomputed per-quad steps to all 4 samples of the quad
+ vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
+ vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
+ }
+
+ // fast unrolled version for 8x8 tile
+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
+ int edgeMask[NumEdges];
+ uint64_t mask;
+
+ auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
+ auto update_lambda = [&](int e){mask &= edgeMask[e];};
+ auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
+ auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
+ auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
+
+// evaluate which samples of the quad are covered: collect the sign bits of every edge
+#define EVAL \
+ UnrollerL<0, NumEdges, 1>::step(eval_lambda);
+
+ // update coverage mask: AND the per-edge sign masks, then OR the 4 bits in at offset 'bit'
+#define UPDATE_MASK(bit) \
+ mask = edgeMask[0]; \
+ UnrollerL<1, NumEdges, 1>::step(update_lambda); \
+ coverageMask |= (mask << bit);
+
+ // step in the +x direction to the next quad
+#define INCX \
+ UnrollerL<0, NumEdges, 1>::step(incx_lambda);
+
+ // step in the +y direction to the next quad
+#define INCY \
+ UnrollerL<0, NumEdges, 1>::step(incy_lambda);
+
+ // step in the -x direction to the next quad
+#define DECX \
+ UnrollerL<0, NumEdges, 1>::step(decx_lambda);
+
+ // sweep 2x2 quad back and forth through the raster tile,
+ // computing coverage masks for the entire tile
+
+ // raster tile
+ // 0 1 2 3 4 5 6 7
+ // x x
+ // x x ------------------>
+ // x x |
+ // <-----------------x x V
+ // ..
+
+ // row 0
+ EVAL;
+ UPDATE_MASK(0);
+ INCX;
+ EVAL;
+ UPDATE_MASK(4);
+ INCX;
+ EVAL;
+ UPDATE_MASK(8);
+ INCX;
+ EVAL;
+ UPDATE_MASK(12);
+ INCY;
+
+ //row 1 (swept right to left, so the bit offsets descend from 28 to 16)
+ EVAL;
+ UPDATE_MASK(28);
+ DECX;
+ EVAL;
+ UPDATE_MASK(24);
+ DECX;
+ EVAL;
+ UPDATE_MASK(20);
+ DECX;
+ EVAL;
+ UPDATE_MASK(16);
+ INCY;
+
+ // row 2
+ EVAL;
+ UPDATE_MASK(32);
+ INCX;
+ EVAL;
+ UPDATE_MASK(36);
+ INCX;
+ EVAL;
+ UPDATE_MASK(40);
+ INCX;
+ EVAL;
+ UPDATE_MASK(44);
+ INCY;
+
+ // row 3 (swept right to left, so the bit offsets descend from 60 to 48)
+ EVAL;
+ UPDATE_MASK(60);
+ DECX;
+ EVAL;
+ UPDATE_MASK(56);
+ DECX;
+ EVAL;
+ UPDATE_MASK(52);
+ DECX;
+ EVAL;
+ UPDATE_MASK(48);
+#else
+ uint32_t bit = 0;
+ for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
+ {
+ __m256d vStartOfRowEdge[NumEdges];
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vStartOfRowEdge[e] = vEdges[e];
+ }
+
+ for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
+ {
+ int edgeMask[NumEdges];
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
+ }
+
+ uint64_t mask = edgeMask[0];
+ for (uint32_t e = 1; e < NumEdges; ++e)
+ {
+ mask &= edgeMask[e];
+ }
+ coverageMask |= (mask << bit);
+
+ // step to the next quad in the x
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
+ }
+ bit+=4;
+ }
+
+ // step to the next row, restarting from the saved start of the current row
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
+ }
+ }
+#endif
+ return coverageMask;
+
+}
+// Top left rule:
+//   Top:  if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
+//   Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
+//   A sample lying exactly on an edge counts as inside only for top or left edges.
+//
+// Implementation: for each of the 4 lanes, 1.0 is subtracted from the edge
+// value (vC--) when either
+//     vA < 0, or
+//     vA == 0 && vB < 0.
+// All other lanes are returned unchanged.
+//
+// vA, vB - A and B coefficients, one 32-bit integer per lane.
+// vEdge  - edge values for the same 4 lanes.
+INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge)
+{
+    // one bit per lane, taken from the sign bits
+    const int aNegative = _mm_movemask_ps(_mm_castsi128_ps(vA));
+    const int bNegative = _mm_movemask_ps(_mm_castsi128_ps(vB));
+
+    // the integer compare yields all-ones (sign bit set) in lanes where vA == 0
+    const __m128i vAIsZero = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
+    const int aZero = _mm_movemask_ps(_mm_castsi128_ps(vAIsZero));
+
+    const int lanesToAdjust = aNegative | (aZero & bNegative);
+
+    // blendv selects on the sign bit of each gMaskToVecpd element
+    const __m256d vEdgeMinusOne = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
+    return _mm256_blendv_pd(vEdge, vEdgeMinusOne, gMaskToVecpd[lanesToAdjust]);
+}
+
+// Returns max(abs(dz/dx), abs(dz/dy)) for the triangle described by pDesc.
+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
+{
+    // Reference derivation. With
+    //   i(x,y) = I[0]*x + I[1]*y + I[2]
+    //   j(x,y) = J[0]*x + J[1]*y + J[2]
+    //   z(x,y) = Z[0]*i(x,y) + Z[1]*j(x,y) + Z[2]
+    // the per-pixel deltas are
+    //   dz/dx = z(1,0) - z(0,0) = Z[0]*I[0] + Z[1]*J[0]
+    //   dz/dy = z(0,1) - z(0,0) = Z[0]*I[1] + Z[1]*J[1]
+    // since every constant term cancels in the difference. The result is scaled by recipDet;
+    // presumably I and J are stored without the 1/det normalization - TODO confirm.
+    const float recipDet = pDesc->recipDet;
+
+    const float rawDzDx = pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0];
+    const float rawDzDy = pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1];
+
+    const float absDzDx = fabsf(recipDet * rawDzDx);
+    const float absDzDy = fabsf(recipDet * rawDzDy);
+
+    return std::max(absDzDx, absDzDy);
+}
+
+// Returns the factor the constant depth bias is multiplied by, chosen from the depth format:
+//   R24_UNORM_X8_TYPELESS -> 2^-24
+//   R16_UNORM             -> 2^-16
+//   R32_FLOAT             -> 2^(exponent(max(abs(z))) - 23)
+// pState - raster state; only depthFormat is read.
+// pDesc  - unused.
+// z      - the 3 vertex depth values of the triangle.
+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
+{
+    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
+    {
+        return (1.0f / (1 << 24));
+    }
+    else if (pState->depthFormat == R16_UNORM)
+    {
+        return (1.0f / (1 << 16));
+    }
+    else
+    {
+        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
+
+        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
+        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
+
+        // Keep only the exponent bits of zMax, i.e. round it down to a power of two.
+        // The bits are moved with memcpy rather than pointer casts: reading a float
+        // through a uint32_t lvalue (and vice versa) violates strict aliasing.
+        uint32_t zMaxInt;
+        memcpy(&zMaxInt, &zMax, sizeof(zMaxInt));
+        zMaxInt &= 0x7f800000;
+        memcpy(&zMax, &zMaxInt, sizeof(zMax));
+
+        return zMax * (1.0f / (1 << 23));
+    }
+}
+
+// Computes the depth bias for a triangle:
+//   bias = depthBias * ComputeBiasFactor(...) + slopeScaledDepthBias * ComputeMaxDepthSlope(...)
+// A positive depthBiasClamp is an upper bound on the result, a negative one a lower bound,
+// and a clamp of zero leaves the bias unclamped.
+// pState - raster state supplying depthBias, slopeScaledDepthBias and depthBiasClamp.
+// pTri   - triangle descriptor used for the depth slope.
+// z      - the 3 vertex depth values of the triangle.
+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
+{
+    const bool hasConstantBias = (pState->depthBias != 0);
+    const bool hasSlopeBias = (pState->slopeScaledDepthBias != 0);
+
+    // nothing to do when both bias terms are disabled
+    if (!hasConstantBias && !hasSlopeBias)
+    {
+        return 0.0f;
+    }
+
+    // slope scaled term; the slope is only evaluated when it can contribute
+    float slopeTerm = pState->slopeScaledDepthBias;
+    if (slopeTerm != 0.0f)
+    {
+        slopeTerm *= ComputeMaxDepthSlope(pTri);
+    }
+
+    const float unclampedBias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + slopeTerm;
+
+    if (pState->depthBiasClamp > 0.0f)
+    {
+        return std::min(unclampedBias, pState->depthBiasClamp);
+    }
+    if (pState->depthBiasClamp < 0.0f)
+    {
+        return std::max(unclampedBias, pState->depthBiasClamp);
+    }
+    return unclampedBias;
+}
+
+// Prevent DCE by writing coverage mask from rasterizer to volatile.
+// Declared with the same THREAD macro as perspAttribsTLS below rather than the
+// MSVC-specific __declspec(thread) spelling.
+#if KNOB_ENABLE_TOSS_POINTS
+THREAD volatile uint64_t gToss;
+#endif
+
+// layout of the per-triangle perspective attribute scratch buffer
+static const uint32_t vertsPerTri = 3;
+static const uint32_t componentsPerAttrib = 4;
+
+// try to avoid _chkstk insertions; make this thread local
+static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
+
+// Fills in the per-edge constants for the edge equation with coefficients a and b:
+// the scalar steps between adjacent quads and raster tiles, and the vector offsets to the
+// 4 pixels of a quad and to the 4 corner pixels of a raster tile.
+INLINE
+void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
+{
+    edge.a = a;
+    edge.b = b;
+
+    // widen before multiplying so the fixed point products are formed in 64 bits
+    const int64_t a64 = static_cast<int64_t>(a);
+    const int64_t b64 = static_cast<int64_t>(b);
+
+    // constant steps to the adjacent quad (2 pixels away)
+    edge.stepQuadX = static_cast<double>(a64 * static_cast<int64_t>(2 * FIXED_POINT_SCALE));
+    edge.stepQuadY = static_cast<double>(b64 * static_cast<int64_t>(2 * FIXED_POINT_SCALE));
+
+    // constant steps to the adjacent raster tile
+    edge.stepRasterTileX = static_cast<double>(a64 * static_cast<int64_t>(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
+    edge.stepRasterTileY = static_cast<double>(b64 * static_cast<int64_t>(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
+
+    const __m256d vCoefA = _mm256_set1_pd(edge.a);
+    const __m256d vCoefB = _mm256_set1_pd(edge.b);
+
+    // Offsets within a quad. _mm256_set_pd takes its arguments high lane first, so the
+    // lanes (low to high) hold the (x, y) offsets (0,0), (1,0), (0,1), (1,1) in fixed point.
+    const __m256d vQuadOffsetsX = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
+    const __m256d vQuadOffsetsY = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
+    edge.vQuadOffsets = _mm256_add_pd(_mm256_mul_pd(vCoefA, vQuadOffsetsX),
+                                      _mm256_mul_pd(vCoefB, vQuadOffsetsY));
+
+    // Offsets to the corner pixels of a raster tile, same lane order as above with the
+    // far corner (KNOB_TILE_*_DIM - 1) pixels away.
+    const __m256d vTileOffsetsX = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
+    const __m256d vTileOffsetsY = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
+    edge.vRasterTileOffsets = _mm256_add_pd(_mm256_mul_pd(vCoefA, vTileOffsetsX),
+                                            _mm256_mul_pd(vCoefB, vTileOffsetsY));
+}
+
+// Builds the edge data for the edge running from p0 to p1:
+// a = p0.y - p1.y, b = p1.x - p0.x.
+INLINE
+void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
+{
+    const int32_t coefA = p0.y - p1.y;
+    const int32_t coefB = p1.x - p0.x;
+    ComputeEdgeData(coefA, coefB, edge);
+}
+
+/// @brief Rasterizes one triangle within a single macrotile: performs triangle setup,
+///        generates a coverage mask per raster tile (and per sample) and invokes the
+///        pixel backend for every raster tile with covered samples.
+/// @tparam RasterizeScissorEdges - when true, the 4 edges of the scissor rect are rasterized
+///         as additional edge equations alongside the 3 triangle edges.
+/// @tparam sampleCount - multisample count to generate coverage for.
+/// @param pDC - draw context.
+/// @param workerId - worker id; forwarded to the backend.
+/// @param macroTile - id of the macrotile to rasterize into.
+/// @param pDesc - pointer to the TRIANGLE_WORK_DESC describing the triangle.
+template<bool RasterizeScissorEdges, SWR_MULTISAMPLE_COUNT sampleCount>
+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
+{
+    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+#if KNOB_ENABLE_TOSS_POINTS
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
+    }
+#endif
+    RDTSC_START(BERasterizeTriangle);
+
+    RDTSC_START(BETriangleSetup);
+    const API_STATE &state = GetApiState(pDC);
+    const SWR_RASTSTATE &rastState = state.rastState;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+
+    __m128 vX, vY, vZ, vRecipW;
+
+    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
+    // eg: vX = [x0 x1 x2 dc]
+    vX = _mm_load_ps(workDesc.pTriBuffer);
+    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+    // convert to fixed point
+    __m128i vXi = fpToFixedPoint(vX);
+    __m128i vYi = fpToFixedPoint(vY);
+
+    // quantize floating point position to fixed point precision
+    // to prevent attribute creep around the triangle vertices
+    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+
+    // triangle setup - A and B edge equation coefs
+    __m128 vA, vB;
+    triangleSetupAB(vX, vY, vA, vB);
+
+    __m128i vAi, vBi;
+    triangleSetupABInt(vXi, vYi, vAi, vBi);
+
+    // determinant
+    float det = calcDeterminantInt(vAi, vBi);
+
+    /// @todo: This test is flipped...we have a stray '-' sign somewhere
+    // Convert CW triangles to CCW
+    if (det > 0.0)
+    {
+        vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
+        vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
+        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
+        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
+        det = -det;
+    }
+
+    __m128 vC;
+    // Finish triangle setup - C edge coef
+    triangleSetupC(vX, vY, vA, vB, vC);
+
+    // compute barycentric i and j
+    // i = (A1x + B1y + C1)/det
+    // j = (A2x + B2y + C2)/det
+    __m128 vDet = _mm_set1_ps(det);
+    __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet);
+    _mm_store_ss(&triDesc.recipDet, vRecipDet);
+
+    // only extract coefs for 2 of the barycentrics; the 3rd can be
+    // determined from the barycentric equation:
+    // i + j + k = 1 <=> k = 1 - j - i
+    _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
+    _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
+    _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
+    _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
+    _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
+    _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
+
+    // 1/w is stored relative to vertex 2, matching the i/j interpolation form
+    OSALIGN(float, 16) oneOverW[4];
+    _mm_store_ps(oneOverW, vRecipW);
+    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
+    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
+    triDesc.OneOverW[2] = oneOverW[2];
+
+    // calculate perspective correct coefs per vertex attrib
+    float* pPerspAttribs = perspAttribsTLS;
+    float* pAttribs = workDesc.pAttribs;
+    triDesc.pPerspAttribs = pPerspAttribs;
+    triDesc.pAttribs = pAttribs;
+    float *pRecipW = workDesc.pTriBuffer + 12;
+    triDesc.pRecipW = pRecipW;
+    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
+    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
+    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
+    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+    {
+        __m128 attribA = _mm_load_ps(pAttribs);
+        __m128 attribB = _mm_load_ps(pAttribs+=4);
+        __m128 attribC = _mm_load_ps(pAttribs+=4);
+        pAttribs+=4;
+
+        attribA = _mm_mul_ps(attribA, vOneOverWV0);
+        attribB = _mm_mul_ps(attribB, vOneOverWV1);
+        attribC = _mm_mul_ps(attribC, vOneOverWV2);
+
+        _mm_store_ps(pPerspAttribs, attribA);
+        _mm_store_ps(pPerspAttribs+=4, attribB);
+        _mm_store_ps(pPerspAttribs+=4, attribC);
+        pPerspAttribs+=4;
+    }
+
+    // compute bary Z
+    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
+    OSALIGN(float, 16) a[4];
+    _mm_store_ps(a, vZ);
+    triDesc.Z[0] = a[0] - a[2];
+    triDesc.Z[1] = a[1] - a[2];
+    triDesc.Z[2] = a[2];
+
+    // add depth bias
+    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
+
+    // Compute edge data
+    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
+    _mm_store_si128((__m128i*)aAi, vAi);
+    _mm_store_si128((__m128i*)aBi, vBi);
+
+    const uint32_t numEdges = 3 + (RasterizeScissorEdges ? 4 : 0);
+    EDGE rastEdges[7];
+
+    // compute triangle edges
+    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
+    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
+    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
+
+    // compute scissor edges if enabled
+    if (RasterizeScissorEdges)
+    {
+        POS topLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.top};
+        POS bottomLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.bottom};
+        POS topRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.top};
+        POS bottomRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.bottom};
+
+        // construct 4 scissor edges in ccw direction
+        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+    }
+
+    // Calc bounding box of triangle
+    OSALIGN(BBOX, 16) bbox;
+    calcBoundingBoxInt(vXi, vYi, bbox);
+
+    // Intersect with scissor/viewport
+    bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left);
+    bbox.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right);
+    bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top);
+    bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom);
+
+    triDesc.triFlags = workDesc.triFlags;
+
+    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+    OSALIGN(BBOX, 16) intersect;
+    intersect.left = std::max(bbox.left, macroBoxLeft);
+    intersect.top = std::max(bbox.top, macroBoxTop);
+    intersect.right = std::min(bbox.right, macroBoxRight);
+    intersect.bottom = std::min(bbox.bottom, macroBoxBottom);
+
+    SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0);
+
+    RDTSC_STOP(BETriangleSetup, 0, pDC->drawId);
+
+    // range of raster tiles (inclusive) touched by the intersected bbox
+    const uint32_t minTileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    const uint32_t minTileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    const uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    const uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    const uint32_t numTilesX = maxTileX - minTileX + 1;
+    const uint32_t numTilesY = maxTileY - minTileY + 1;
+
+    if (numTilesX == 0 || numTilesY == 0)
+    {
+        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
+        RDTSC_STOP(BERasterizeTriangle, 1, 0);
+        return;
+    }
+
+    RDTSC_START(BEStepSetup);
+
+    // Step to pixel center of top-left pixel of the triangle bbox
+    // Align intersect bbox (top/left) to raster tile's (top/left).
+    int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
+    int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
+
+    if(sampleCount == SWR_MULTISAMPLE_1X)
+    {
+        // Add 0.5, in fixed point, to offset to pixel center
+        x += (FIXED_POINT_SCALE / 2);
+        y += (FIXED_POINT_SCALE / 2);
+    }
+
+    __m128i vTopLeftX = _mm_set1_epi32(x);
+    __m128i vTopLeftY = _mm_set1_epi32(y);
+
+    // evaluate edge equations at top-left pixel using 64bit math
+    // all other evaluations will be 32bit steps from it
+    // small triangles could skip this and do all 32bit math
+    // edge 0
+    //
+    // line = Ax + By + C
+    // solving for C:
+    // C = -Ax - By
+    // we know x0 and y0 are on the line; plug them in:
+    // C = -Ax0 - By0
+    // plug C back into line equation:
+    // line = Ax + By - Ax0 - By0
+    // line = A(x - x0) + B(y - y0)
+    // line = A(x0+dX) + B(y0+dY) + C = Ax0 + AdX + By0 + BdY + C = AdX + BdY
+
+    // edge 0 and 1
+    // edge0 = A0(x - x0) + B0(y - y0)
+    // edge1 = A1(x - x1) + B1(y - y1)
+    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
+    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
+
+    __m256d vEdgeFix16[7];
+
+    // evaluate A(dx) and B(dY) for all points
+    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
+    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
+    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
+
+    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
+    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
+    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+
+    // adjust for top-left rule
+    vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+
+    // broadcast respective edge results to all lanes
+    double* pEdge = (double*)&vEdge;
+    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
+    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
+    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
+
+    // evaluate edge equations for scissor edges
+    if (RasterizeScissorEdges)
+    {
+        const BBOX &scissor = state.scissorInFixedPoint;
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top)));
+    }
+
+    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
+    // used for testing if entire raster tile is inside a triangle.
+    // Only the 3 triangle edges get the corner offsets; the scissor edges stay broadcast.
+    vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets);
+    vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets);
+    vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
+
+    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
+    // step sample positions to the raster tile bbox of multisample points
+    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
+    //                             |      |
+    //                             |      |
+    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
+    __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
+    if (sampleCount > SWR_MULTISAMPLE_1X)
+    {
+        __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
+        __m128i vTileSampleBBoxYh = MultisampleTraits<sampleCount>::TileSampleOffsetsY();
+
+        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
+        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
+
+        // step edge equation tests from Tile
+        // used for testing if entire raster tile is inside a triangle
+        __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8);
+        __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8);
+        vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8);
+        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8);
+        vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8);
+        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8);
+        vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+    }
+
+    RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
+
+    // compute steps between raster tiles for render output buffers
+    static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
+    static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep};
+    static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
+    static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep};
+    static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
+    static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep};
+    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
+
+    GetRenderHotTiles(pDC, macroTile, minTileX, minTileY, renderBuffers, MultisampleTraits<sampleCount>::numSamples,
+        triDesc.triFlags.renderTargetArrayIndex);
+    currentRenderBufferRow = renderBuffers;
+
+    // rasterize and generate coverage masks per sample
+    const uint32_t maxSamples = MultisampleTraits<sampleCount>::numSamples;
+    for (uint32_t tileY = minTileY; tileY <= maxTileY; ++tileY)
+    {
+        // remember the edge values at the start of the row so the Y step restarts from them
+        __m256d vStartOfRowEdge[numEdges];
+        for (uint32_t e = 0; e < numEdges; ++e)
+        {
+            vStartOfRowEdge[e] = vEdgeFix16[e];
+        }
+
+        for (uint32_t tileX = minTileX; tileX <= maxTileX; ++tileX)
+        {
+            uint64_t anyCoveredSamples = 0;
+
+            // is the corner of the edge outside of the raster tile? (vEdge < 0)
+            int mask0, mask1, mask2;
+            if (sampleCount == SWR_MULTISAMPLE_1X)
+            {
+                mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
+                mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
+                mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
+            }
+            else
+            {
+                __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
+                // evaluate edge equations at the tile multisample bounding box
+                vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
+                vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
+                vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
+                mask0 = _mm256_movemask_pd(vSampleBboxTest0);
+                mask1 = _mm256_movemask_pd(vSampleBboxTest1);
+                mask2 = _mm256_movemask_pd(vSampleBboxTest2);
+            }
+
+            for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++)
+            {
+                // trivial reject, at least one edge has all 4 corners of raster tile outside
+                const bool trivialReject = !(mask0 && mask1 && mask2);
+
+                if (!trivialReject)
+                {
+                    // trivial accept mask
+                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+                    // The masks only describe the 3 triangle edges. When scissor edges are
+                    // rasterized, a tile that is fully inside the triangle can still straddle
+                    // the scissor rect, so trivial accept would mark samples outside the
+                    // scissor as covered; such tiles must go through rasterizePartialTile.
+                    if (!RasterizeScissorEdges && ((mask0 & mask1 & mask2) == 0xf))
+                    {
+                        anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        // trivial accept, all 4 corners of all 3 edges are negative
+                        // i.e. raster tile completely inside triangle
+                        RDTSC_EVENT(BETrivialAccept, 1, 0);
+                    }
+                    else
+                    {
+                        __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample;
+                        if(sampleCount == SWR_MULTISAMPLE_1X)
+                        {
+                            // should get optimized out for single sample case (global value numbering or copy propagation)
+                            vEdge0AtSample = vEdgeFix16[0];
+                            vEdge1AtSample = vEdgeFix16[1];
+                            vEdge2AtSample = vEdgeFix16[2];
+                        }
+                        else
+                        {
+                            __m128i vSampleOffsetXh = MultisampleTraits<sampleCount>::vXi(sampleNum);
+                            __m128i vSampleOffsetYh = MultisampleTraits<sampleCount>::vYi(sampleNum);
+                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
+                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
+
+                            // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0]
+                            // for each edge and broadcasts it before offsetting to individual pixel quads
+
+                            // step edge equation tests from UL tile corner to pixel sample position
+                            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX);
+                            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY);
+                            vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                            vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
+
+                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX);
+                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY);
+                            vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                            vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
+
+                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX);
+                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY);
+                            vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                            vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
+                        }
+
+                        // store lane 0 (the UL corner value) of every edge as the partial tile start
+                        double startQuadEdges[numEdges];
+                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+                        _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample);
+                        _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample);
+                        _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample);
+
+                        for (uint32_t e = 3; e < numEdges; ++e)
+                        {
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]);
+                        }
+
+                        // not trivial accept or reject, must rasterize full tile
+                        RDTSC_START(BERasterizePartial);
+                        if (RasterizeScissorEdges)
+                        {
+                            triDesc.coverageMask[sampleNum] = rasterizePartialTile<7>(pDC, startQuadEdges, rastEdges);
+                        }
+                        else
+                        {
+                            triDesc.coverageMask[sampleNum] = rasterizePartialTile<3>(pDC, startQuadEdges, rastEdges);
+                        }
+                        RDTSC_STOP(BERasterizePartial, 0, 0);
+
+                        anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+                    }
+                }
+                else
+                {
+                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
+                    if(sampleCount > SWR_MULTISAMPLE_1X)
+                    {
+                        triDesc.coverageMask[sampleNum] = 0;
+                    }
+                    RDTSC_EVENT(BETrivialReject, 1, 0);
+                }
+            }
+
+#if KNOB_ENABLE_TOSS_POINTS
+            if(KNOB_TOSS_RS)
+            {
+                gToss = triDesc.coverageMask[0];
+            }
+            else
+#endif
+            if(anyCoveredSamples)
+            {
+                RDTSC_START(BEPixelBackend);
+                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+                RDTSC_STOP(BEPixelBackend, 0, 0);
+            }
+
+            // step to the next tile in X
+            for (uint32_t e = 0; e < numEdges; ++e)
+            {
+                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+            }
+            StepRasterTileX(state.psState.numRenderTargets, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep);
+        }
+
+        // step to the next tile in Y
+        for (uint32_t e = 0; e < numEdges; ++e)
+        {
+            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+        }
+        StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep);
+    }
+
+    RDTSC_STOP(BERasterizeTriangle, 1, 0);
+}
+
+// Overwrites, for every attribute slot whose bit is set in texCoordMask, the 3 per-vertex
+// vec4 attributes in pAttribBuffer (3 consecutive vec4s per attribute) with pVertexTexCoords[0..2].
+static INLINE void OverwritePointSpriteTexCoords(float* pAttribBuffer, uint32_t texCoordMask, const __m128* pVertexTexCoords)
+{
+    DWORD texCoordAttrib = 0;
+    while (_BitScanForward(&texCoordAttrib, texCoordMask))
+    {
+        // unsigned literal: avoids shifting into the sign bit of an int for slot 31
+        texCoordMask &= ~(1u << texCoordAttrib);
+
+        __m128* pTexAttrib = (__m128*)pAttribBuffer + 3 * texCoordAttrib;
+        pTexAttrib[0] = pVertexTexCoords[0];
+        pTexAttrib[1] = pVertexTexCoords[1];
+        pTexAttrib[2] = pVertexTexCoords[2];
+    }
+}
+
+// Rasterizes a point as two triangles: the point is bloated by its point size into a
+// screen-aligned square which is split along the (lowerX,lowerY)-(upperX,upperY) diagonal,
+// and each half is handed to the triangle rasterizer selected from gRasterizerTable.
+// When point sprite texcoords are enabled, the selected attributes are replaced per vertex
+// with the sprite corner coordinates.
+// pDC       - draw context
+// workerId  - worker id; forwarded to the triangle rasterizer
+// macroTile - id of the macrotile to rasterize into
+// pData     - pointer to the TRIANGLE_WORK_DESC describing the point
+void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+{
+    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
+    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
+
+    const bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
+
+    // load point vertex
+    const float x = *workDesc.pTriBuffer;
+    const float y = *(workDesc.pTriBuffer + 1);
+    const float z = *(workDesc.pTriBuffer + 2);
+
+    // scratch copies of the triangle buffer and attrib buffer to write adjusted data to
+    OSALIGNSIMD(float) newTriBuffer[4 * 4];
+    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
+
+    // the copy carries over pUserClipBuffer, numAttribs, triFlags and pAttribs;
+    // pAttribs is only redirected below when texcoords have to be overwritten
+    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
+    newWorkDesc.pTriBuffer = &newTriBuffer[0];
+
+    // construct two tris by bloating point by point size
+    const float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
+    const float lowerX = x - halfPointSize;
+    const float upperX = x + halfPointSize;
+    const float lowerY = y - halfPointSize;
+    const float upperY = y + halfPointSize;
+
+    // tri 0: (lowerX,lowerY) (lowerX,upperY) (upperX,upperY)
+    // tri buffer layout is [x0 x1 x2 dc][y0 y1 y2 dc][z...][w...]; the dc lanes are left unwritten
+    newTriBuffer[0] = lowerX;
+    newTriBuffer[1] = lowerX;
+    newTriBuffer[2] = upperX;
+    newTriBuffer[4] = lowerY;
+    newTriBuffer[5] = upperY;
+    newTriBuffer[6] = upperY;
+    _mm_store_ps(&newTriBuffer[8], _mm_set1_ps(z));
+    _mm_store_ps(&newTriBuffer[12], _mm_set1_ps(1.0f));
+
+    // setup triangle rasterizer function
+    PFN_WORK_FUNC pfnTriRast;
+    if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
+    {
+        pfnTriRast = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
+    }
+    else
+    {
+        // for center sample pattern, all samples are at pixel center; calculate coverage
+        // once at center and broadcast the results in the backend
+        pfnTriRast = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
+    }
+
+    // Sprite texcoords at the 4 corners of the bloated point. _mm_set_ps takes its arguments
+    // high lane first, so _mm_set_ps(1, 0, t, s) is the vec4 (s, t, 0, 1). The t coordinate
+    // is flipped when the sprite origin is not at the top.
+    const bool topOrigin = rastState.pointSpriteTopOrigin ? true : false;
+    const __m128 vTexLowerXLowerY = topOrigin ? _mm_set_ps(1, 0, 0, 0) : _mm_set_ps(1, 0, 1, 0);
+    const __m128 vTexLowerXUpperY = topOrigin ? _mm_set_ps(1, 0, 1, 0) : _mm_set_ps(1, 0, 0, 0);
+    const __m128 vTexUpperXUpperY = topOrigin ? _mm_set_ps(1, 0, 1, 1) : _mm_set_ps(1, 0, 0, 1);
+    const __m128 vTexUpperXLowerY = topOrigin ? _mm_set_ps(1, 0, 0, 1) : _mm_set_ps(1, 0, 1, 1);
+
+    // overwrite texcoords for point sprites
+    if (isPointSpriteTexCoordEnabled)
+    {
+        // the copy below must fit the fixed size scratch buffer
+        SWR_ASSERT(workDesc.numAttribs <= KNOB_NUM_ATTRIBUTES);
+
+        // copy original attribs
+        memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
+        newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+        const __m128 tri0TexCoords[3] = { vTexLowerXLowerY, vTexLowerXUpperY, vTexUpperXUpperY };
+        OverwritePointSpriteTexCoords(&newAttribBuffer[0], backendState.pointSpriteTexCoordMask, tri0TexCoords);
+    }
+    // else: no texcoord overwrite, the attrib buffer from the frontend is reused as is
+
+    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+
+    // tri 1: (lowerX,lowerY) (upperX,upperY) (upperX,lowerY); z, w unchanged
+    newTriBuffer[0] = lowerX;
+    newTriBuffer[1] = upperX;
+    newTriBuffer[2] = upperX;
+    newTriBuffer[4] = lowerY;
+    newTriBuffer[5] = upperY;
+    newTriBuffer[6] = lowerY;
+
+    if (isPointSpriteTexCoordEnabled)
+    {
+        const __m128 tri1TexCoords[3] = { vTexLowerXLowerY, vTexUpperXUpperY, vTexUpperXLowerY };
+        OverwritePointSpriteTexCoords(&newAttribBuffer[0], backendState.pointSpriteTexCoordMask, tri1TexCoords);
+    }
+
+    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+}
+
+// Rasterizes a point that covers a single pixel: builds a one-bit coverage mask and a
+// constant-interpolation triangle descriptor, then invokes the pixel backend directly.
+// pDC       - draw context
+// workerId  - worker id; forwarded to the backend
+// macroTile - id of the macrotile to rasterize into
+// pData     - pointer to the TRIANGLE_WORK_DESC describing the point
+void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+{
+#if KNOB_ENABLE_TOSS_POINTS
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
+    }
+#endif
+
+    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // map x,y relative offsets from start of raster tile to bit position in
+    // coverage mask for the point; indexed as coverageMap[y][x]
+    static const uint32_t coverageMap[8][8] = {
+        { 0, 1, 4, 5, 8, 9, 12, 13 },
+        { 2, 3, 6, 7, 10, 11, 14, 15 },
+        { 16, 17, 20, 21, 24, 25, 28, 29 },
+        { 18, 19, 22, 23, 26, 27, 30, 31 },
+        { 32, 33, 36, 37, 40, 41, 44, 45 },
+        { 34, 35, 38, 39, 42, 43, 46, 47 },
+        { 48, 49, 52, 53, 56, 57, 60, 61 },
+        { 50, 51, 54, 55, 58, 59, 62, 63 }
+    };
+
+    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+
+    // pull point information from triangle buffer
+    // @todo use structs for readability
+    // The first two slots of the float buffer carry raw uint32 bit patterns; they are
+    // copied out with memcpy because reading them through a uint32_t* cast would violate
+    // strict aliasing.
+    uint32_t tileAlignedX;
+    uint32_t tileAlignedY;
+    memcpy(&tileAlignedX, workDesc.pTriBuffer, sizeof(tileAlignedX));
+    memcpy(&tileAlignedY, workDesc.pTriBuffer + 1, sizeof(tileAlignedY));
+    const float z = *(workDesc.pTriBuffer + 2);
+
+    // construct triangle descriptor for point
+    // no interpolation, set up i,j for constant interpolation of z and attribs
+    // @todo implement an optimized backend that doesn't require triangle information
+
+    // compute coverage mask from x,y packed into the coverageMask flag
+    // mask indices by the maximum valid index for x/y of coveragemap.
+    const uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
+    const uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
+    // todo: multisample points?
+    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
+
+    // no persp divide needed for points
+    // NOTE(review): triDesc.pUserClipBuffer and triDesc.pRecipW are not assigned here,
+    // unlike in RasterizeTriangle - confirm the backend never reads them for points.
+    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
+    triDesc.triFlags = workDesc.triFlags;
+    triDesc.recipDet = 1.0f;
+    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
+    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
+    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
+    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
+
+    RenderOutputBuffers renderBuffers;
+    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
+        renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex);
+
+    RDTSC_START(BEPixelBackend);
+    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
+    RDTSC_STOP(BEPixelBackend, 0, 0);
+}
+
+// Looks up the hot tiles of the enabled color, depth and stencil attachments for a
+// macrotile, marks each of them dirty, and stores pointers to the requested raster tile
+// within them in renderBuffers.
+// pDC                    - draw context
+// macroID                - id of the macrotile
+// tileX, tileY           - raster tile coordinates; made relative to the macrotile below
+// renderBuffers          - receives the color/depth/stencil pointers
+// numSamples             - sample count, scales the raster tile offset
+// renderTargetArrayIndex - forwarded to the hot tile lookup
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers,
+    uint32_t numSamples, uint32_t renderTargetArrayIndex)
+{
+    const API_STATE& state = GetApiState(pDC);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    // make the tile coordinates relative to the macrotile origin
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroID, macroX, macroY);
+    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * macroX;
+    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * macroY;
+
+    // color: one hot tile per enabled render target, all sharing the same tile offset
+    const uint32_t colorPitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+    uint32_t colorOffset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(colorPitch, tileX, tileY);
+    colorOffset *= numSamples;
+
+    uint32_t remainingColorMask = state.colorHottileEnable;
+    unsigned long rtSlot = 0;
+    while (_BitScanForward(&rtSlot, remainingColorMask))
+    {
+        // consume the slot that was just found
+        remainingColorMask &= ~(1 << rtSlot);
+
+        const SWR_RENDERTARGET_ATTACHMENT attachment = (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot);
+        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, attachment, true,
+            numSamples, renderTargetArrayIndex);
+        pColor->state = HOTTILE_DIRTY;
+        renderBuffers.pColor[rtSlot] = pColor->pBuffer + colorOffset;
+    }
+
+    // depth
+    if (state.depthHottileEnable)
+    {
+        const uint32_t depthPitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t depthOffset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(depthPitch, tileX, tileY);
+        depthOffset *= numSamples;
+
+        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
+            numSamples, renderTargetArrayIndex);
+        pDepth->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pDepth->pBuffer != nullptr);
+        renderBuffers.pDepth = pDepth->pBuffer + depthOffset;
+    }
+
+    // stencil
+    if (state.stencilHottileEnable)
+    {
+        const uint32_t stencilPitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t stencilOffset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(stencilPitch, tileX, tileY);
+        stencilOffset *= numSamples;
+
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
+            numSamples, renderTargetArrayIndex);
+        pStencil->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pStencil->pBuffer != nullptr);
+        renderBuffers.pStencil = pStencil->pBuffer + stencilOffset;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Advance the working output pointers by the supplied per-tile steps.
+/// @param NumRT - number of color pointers (pColor[0..NumRT-1]) to advance.
+/// @param buffers - pointer set that is advanced in place.
+/// @param colorTileStep - amount added to each color pointer.
+/// @param depthTileStep - amount added to the depth pointer.
+/// @param stencilTileStep - amount added to the stencil pointer.
+INLINE
+void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep)
+{
+    // depth and stencil have exactly one pointer each
+    buffers.pDepth += depthTileStep;
+    buffers.pStencil += stencilTileStep;
+
+    // color has one pointer per render target
+    uint32_t rt = 0;
+    while (rt < NumRT)
+    {
+        buffers.pColor[rt] += colorTileStep;
+        ++rt;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Advance the row-start pointers by the supplied row steps and reset
+///        the working pointers to the new row start.
+/// @param NumRT - number of color pointers (pColor[0..NumRT-1]) to update.
+/// @param buffers - working pointers; overwritten with the new row start.
+/// @param startBufferRow - row-start pointers; advanced in place.
+/// @param colorRowStep - amount added to each row-start color pointer.
+/// @param depthRowStep - amount added to the row-start depth pointer.
+/// @param stencilRowStep - amount added to the row-start stencil pointer.
+INLINE
+void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep)
+{
+    // color: bump each row start, then rewind the working pointer onto it
+    uint32_t rt = 0;
+    while (rt < NumRT)
+    {
+        buffers.pColor[rt] = (startBufferRow.pColor[rt] += colorRowStep);
+        ++rt;
+    }
+
+    // depth and stencil follow the same bump-then-rewind pattern
+    buffers.pDepth = (startBufferRow.pDepth += depthRowStep);
+    buffers.pStencil = (startBufferRow.pStencil += stencilRowStep);
+}
+
+// initialize rasterizer function table; RasterizeLine below indexes it as [rastState.scissorEnable][rastState.sampleCount]
+PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX] =
+{
+ RasterizeTriangle<false, SWR_MULTISAMPLE_1X>, // first five entries: first template argument is false
+ RasterizeTriangle<false, SWR_MULTISAMPLE_2X>,
+ RasterizeTriangle<false, SWR_MULTISAMPLE_4X>,
+ RasterizeTriangle<false, SWR_MULTISAMPLE_8X>,
+ RasterizeTriangle<false, SWR_MULTISAMPLE_16X>,
+ RasterizeTriangle<true, SWR_MULTISAMPLE_1X>, // last five entries: first template argument is true. NOTE(review): these land in row [1] only if SWR_MULTISAMPLE_TYPE_MAX == 5 (flat initializer, no inner braces) - confirm
+ RasterizeTriangle<true, SWR_MULTISAMPLE_2X>,
+ RasterizeTriangle<true, SWR_MULTISAMPLE_4X>,
+ RasterizeTriangle<true, SWR_MULTISAMPLE_8X>,
+ RasterizeTriangle<true, SWR_MULTISAMPLE_16X>
+};
+
+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) // rasterizes one binned line inside one macrotile by expanding it into two triangles; pData is read as a TRIANGLE_WORK_DESC
+{
+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
+#if KNOB_ENABLE_TOSS_POINTS
+ if (KNOB_TOSS_BIN_TRIS) // toss point: drop the work before any rasterization (and before the RDTSC bucket starts)
+ {
+ return;
+ }
+#endif
+
+ // bloat line to two tris and call the triangle rasterizer twice
+ RDTSC_START(BERasterizeLine);
+
+ const API_STATE &state = GetApiState(pDC);
+ const SWR_RASTSTATE &rastState = state.rastState;
+
+ // macrotile dimensioning: inclusive bounds of this macrotile, in the same fixed-point units as the bounding box computed below
+ uint32_t macroX, macroY;
+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+ // create a copy of the triangle buffer to write our adjusted vertices to
+ OSALIGNSIMD(float) newTriBuffer[4 * 4]; // four rows of 4 floats: X, Y, Z, 1/W (stored at offsets 0, 4, 8, 12 below)
+ TRIANGLE_WORK_DESC newWorkDesc = workDesc;
+ newWorkDesc.pTriBuffer = &newTriBuffer[0];
+
+ // create a copy of the attrib buffer to write our adjusted attribs to
+ OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; // 12 floats per attribute: 3 vertices x 4 components
+ newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+ const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); // lanes 0..3 = +0.5, -0.5, -0.5, +0.5 (_mm_set_ps takes the highest lane first)
+ const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); // lanes 0..3 = -0.5, +0.5, +0.5, +0.5
+
+ __m128 vX, vY, vZ, vRecipW;
+
+ vX = _mm_load_ps(workDesc.pTriBuffer);
+ vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+ // triangle 0
+ // v0,v1 -> v0,v0,v1 (the shuffle yields lanes v0,v0,v1,v1)
+ __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
+ __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
+ __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
+ __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
+
+ __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
+ __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); // per-lane offset of +/- half the line width
+ if (workDesc.triFlags.yMajor)
+ {
+ vXa = _mm_add_ps(vAdjust, vXa); // y-major line: widen along X
+ }
+ else
+ {
+ vYa = _mm_add_ps(vAdjust, vYa); // otherwise: widen along Y
+ }
+
+ // Store triangle description for rasterizer
+ _mm_store_ps((float*)&newTriBuffer[0], vXa);
+ _mm_store_ps((float*)&newTriBuffer[4], vYa);
+ _mm_store_ps((float*)&newTriBuffer[8], vZa);
+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
+
+ // binner bins 3 edges for lines as v0, v1, v1
+ // tri0 needs v0, v0, v1
+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
+ {
+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
+
+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
+ }
+
+ // Store user clip distances for triangle 0
+ float newClipBuffer[3 * 8]; // 3 floats per clip distance; NOTE(review): sized for up to 8 distances - assumes clipDistanceMask has at most 8 bits set, confirm
+ uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
+ if (numClipDist)
+ {
+ newWorkDesc.pUserClipBuffer = newClipBuffer;
+
+ float* pOldBuffer = workDesc.pUserClipBuffer;
+ float* pNewBuffer = newClipBuffer;
+ for (uint32_t i = 0; i < numClipDist; ++i)
+ {
+ // read barycentric coeffs from binner (two floats consumed per clip distance)
+ float a = *(pOldBuffer++);
+ float b = *(pOldBuffer++);
+
+ // reconstruct original clip distance at vertices
+ float c0 = a + b;
+ float c1 = b;
+
+ // construct triangle barycentrics (three floats written per clip distance) for vertex order v0, v0, v1
+ *(pNewBuffer++) = c0 - c1;
+ *(pNewBuffer++) = c0 - c1;
+ *(pNewBuffer++) = c1;
+ }
+ }
+
+ // skip the triangle unless its fixed-point bounding box overlaps both this macrotile and the scissor rect
+ __m128i vXai = fpToFixedPoint(vXa);
+ __m128i vYai = fpToFixedPoint(vYa);
+ OSALIGN(BBOX, 16) bboxA;
+ calcBoundingBoxInt(vXai, vYai, bboxA);
+
+ if (!(bboxA.left > macroBoxRight ||
+ bboxA.left > state.scissorInFixedPoint.right ||
+ bboxA.right - 1 < macroBoxLeft ||
+ bboxA.right - 1 < state.scissorInFixedPoint.left ||
+ bboxA.top > macroBoxBottom ||
+ bboxA.top > state.scissorInFixedPoint.bottom ||
+ bboxA.bottom - 1 < macroBoxTop ||
+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
+ // rasterize triangle
+ gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc);
+ }
+
+ // triangle 1
+ // v0,v1 -> v1,v1,v0 (the shuffle yields lanes v1,v1,v0,v1)
+ vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
+ vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
+ vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
+ vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
+
+ vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
+ if (workDesc.triFlags.yMajor)
+ {
+ vXa = _mm_add_ps(vAdjust, vXa);
+ }
+ else
+ {
+ vYa = _mm_add_ps(vAdjust, vYa);
+ }
+
+ // Store triangle description for rasterizer (reuses newTriBuffer, which newWorkDesc already points at)
+ _mm_store_ps((float*)&newTriBuffer[0], vXa);
+ _mm_store_ps((float*)&newTriBuffer[4], vYa);
+ _mm_store_ps((float*)&newTriBuffer[8], vZa);
+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
+
+ // binner bins 3 edges for lines as v0, v1, v1
+ // tri1 needs v1, v1, v0
+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
+ {
+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
+
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
+ }
+
+ // store user clip distance for triangle 1
+ if (numClipDist)
+ {
+ float* pOldBuffer = workDesc.pUserClipBuffer;
+ float* pNewBuffer = newClipBuffer;
+ for (uint32_t i = 0; i < numClipDist; ++i)
+ {
+ // read barycentric coeffs from binner
+ float a = *(pOldBuffer++);
+ float b = *(pOldBuffer++);
+
+ // reconstruct original clip distance at vertices
+ float c0 = a + b;
+ float c1 = b;
+
+ // construct triangle barycentrics for vertex order v1, v1, v0
+ *(pNewBuffer++) = c1 - c0;
+ *(pNewBuffer++) = c1 - c0;
+ *(pNewBuffer++) = c0;
+ }
+ }
+
+ vXai = fpToFixedPoint(vXa);
+ vYai = fpToFixedPoint(vYa);
+ calcBoundingBoxInt(vXai, vYai, bboxA);
+
+ if (!(bboxA.left > macroBoxRight ||
+ bboxA.left > state.scissorInFixedPoint.right ||
+ bboxA.right - 1 < macroBoxLeft ||
+ bboxA.right - 1 < state.scissorInFixedPoint.left ||
+ bboxA.top > macroBoxBottom ||
+ bboxA.top > state.scissorInFixedPoint.bottom ||
+ bboxA.bottom - 1 < macroBoxTop ||
+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
+ // rasterize triangle
+ gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc);
+ }
+
+ RDTSC_STOP(BERasterizeLine, 1, 0);
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
new file mode 100644
index 00000000000..bcfeef48410
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
@@ -0,0 +1,35 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rasterizer.h
+*
+* @brief Definitions for the rasterizer.
+*
+******************************************************************************/
+#pragma once
+
+#include "context.h"
+
+extern PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX]; // triangle rasterizer entry points; RasterizeLine indexes it as [scissorEnable][sampleCount]
+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); // pData is read as a TRIANGLE_WORK_DESC
+void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); // same signature as RasterizeLine
+void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); // same signature as RasterizeLine
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
new file mode 100644
index 00000000000..4b6b536075b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
@@ -0,0 +1,91 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#include "rdtsc_core.h"
+#include "common/rdtsc_buckets.h"
+
+// must match CORE_BUCKETS enum order: rdtscInit() registers gCoreBuckets[i] for every enum value i below NumBuckets
+BUCKET_DESC gCoreBuckets[] = {
+ { "APIClearRenderTarget", "", true, 0xff0b8bea }, // NOTE(review): fields look like { name, description, flag, color } - confirm against the BUCKET_DESC declaration
+ { "APIDraw", "", true, 0xff000066 },
+ { "APIDrawWakeAllThreads", "", false, 0xffffffff },
+ { "APIDrawIndexed", "", true, 0xff000066 },
+ { "APIDispatch", "", true, 0xff660000 },
+ { "APIStoreTiles", "", true, 0xff00ffff },
+ { "APIGetDrawContext", "", false, 0xffffffff },
+ { "APISync", "", true, 0xff6666ff },
+ { "APIWaitForIdle", "", true, 0xff0000ff },
+ { "FEProcessDraw", "", true, 0xff009900 },
+ { "FEProcessDrawIndexed", "", true, 0xff009900 },
+ { "FEFetchShader", "", false, 0xffffffff },
+ { "FEVertexShader", "", false, 0xffffffff },
+ { "FEHullShader", "", false, 0xffffffff },
+ { "FETessellation", "", false, 0xffffffff },
+ { "FEDomainShader", "", false, 0xffffffff },
+ { "FEGeometryShader", "", false, 0xffffffff },
+ { "FEStreamout", "", false, 0xffffffff },
+ { "FEPAAssemble", "", false, 0xffffffff },
+ { "FEBinPoints", "", false, 0xff29b854 },
+ { "FEBinLines", "", false, 0xff29b854 },
+ { "FEBinTriangles", "", false, 0xff29b854 },
+ { "FETriangleSetup", "", false, 0xffffffff },
+ { "FEViewportCull", "", false, 0xffffffff },
+ { "FEGuardbandClip", "", false, 0xffffffff },
+ { "FEClipPoints", "", false, 0xffffffff },
+ { "FEClipLines", "", false, 0xffffffff },
+ { "FEClipTriangles", "", false, 0xffffffff },
+ { "FECullZeroAreaAndBackface", "", false, 0xffffffff },
+ { "FECullBetweenCenters", "", false, 0xffffffff },
+ { "FEProcessStoreTiles", "", true, 0xff39c864 },
+ { "FEProcessInvalidateTiles", "", true, 0xffffffff },
+ { "WorkerWorkOnFifoBE", "", false, 0xff40261c },
+ { "WorkerFoundWork", "", false, 0xff573326 },
+ { "BELoadTiles", "", true, 0xffb0e2ff },
+ { "BEDispatch", "", true, 0xff00a2ff },
+ { "BEClear", "", true, 0xff00ccbb },
+ { "BERasterizeLine", "", true, 0xffb26a4e },
+ { "BERasterizeTriangle", "", true, 0xffb26a4e },
+ { "BETriangleSetup", "", false, 0xffffffff },
+ { "BEStepSetup", "", false, 0xffffffff },
+ { "BECullZeroArea", "", false, 0xffffffff },
+ { "BEEmptyTriangle", "", false, 0xffffffff },
+ { "BETrivialAccept", "", false, 0xffffffff },
+ { "BETrivialReject", "", false, 0xffffffff },
+ { "BERasterizePartial", "", false, 0xffffffff },
+ { "BEPixelBackend", "", false, 0xffffffff },
+ { "BESetup", "", false, 0xffffffff },
+ { "BEBarycentric", "", false, 0xffffffff },
+ { "BEEarlyDepthTest", "", false, 0xffffffff },
+ { "BEPixelShader", "", false, 0xffffffff },
+ { "BELateDepthTest", "", false, 0xffffffff },
+ { "BEOutputMerger", "", false, 0xffffffff },
+ { "BEStoreTiles", "", true, 0xff00cccc },
+ { "BEEndTile", "", false, 0xffffffff },
+ { "WorkerWaitForThreadEvent", "", false, 0xffffffff }, // last entry; pairs with the final enumerator before NumBuckets
+};
+
+/// @todo bucketmanager and mapping should probably be a part of the SWR context
+std::vector<uint32_t> gBucketMap; // CORE_BUCKETS value -> id returned by gBucketMgr.RegisterBucket(); filled by rdtscInit(0)
+BucketManager gBucketMgr(KNOB_BUCKETS_ENABLE_THREADVIZ);
+
+uint32_t gCurrentFrame = 0; // frame counter: incremented by rdtscEndFrame(), zeroed by rdtscReset()
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
new file mode 100644
index 00000000000..5fcc40bf8ee
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
@@ -0,0 +1,177 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#pragma once
+#include "knobs.h"
+
+#include "common/os.h"
+#include "common/rdtsc_buckets.h"
+
+#include <vector>
+
+enum CORE_BUCKETS // bucket ids passed to the RDTSC_* macros; order must stay in sync with the gCoreBuckets table in rdtsc_core.cpp
+{
+ APIClearRenderTarget, // first enumerator (value 0); pairs with gCoreBuckets[0]
+ APIDraw,
+ APIDrawWakeAllThreads,
+ APIDrawIndexed,
+ APIDispatch,
+ APIStoreTiles,
+ APIGetDrawContext,
+ APISync,
+ APIWaitForIdle,
+ FEProcessDraw,
+ FEProcessDrawIndexed,
+ FEFetchShader,
+ FEVertexShader,
+ FEHullShader,
+ FETessellation,
+ FEDomainShader,
+ FEGeometryShader,
+ FEStreamout,
+ FEPAAssemble,
+ FEBinPoints,
+ FEBinLines,
+ FEBinTriangles,
+ FETriangleSetup,
+ FEViewportCull,
+ FEGuardbandClip,
+ FEClipPoints,
+ FEClipLines,
+ FEClipTriangles,
+ FECullZeroAreaAndBackface,
+ FECullBetweenCenters,
+ FEProcessStoreTiles,
+ FEProcessInvalidateTiles,
+ WorkerWorkOnFifoBE,
+ WorkerFoundWork,
+ BELoadTiles,
+ BEDispatch,
+ BEClear,
+ BERasterizeLine,
+ BERasterizeTriangle,
+ BETriangleSetup,
+ BEStepSetup,
+ BECullZeroArea,
+ BEEmptyTriangle,
+ BETrivialAccept,
+ BETrivialReject,
+ BERasterizePartial,
+ BEPixelBackend,
+ BESetup,
+ BEBarycentric,
+ BEEarlyDepthTest,
+ BEPixelShader,
+ BELateDepthTest,
+ BEOutputMerger,
+ BEStoreTiles,
+ BEEndTile,
+ WorkerWaitForThreadEvent,
+
+ NumBuckets // count of buckets above; rdtscInit() sizes gBucketMap with it - keep it last
+};
+
+void rdtscReset(); // zeroes the frame counter and clears the bucket manager's threads and buckets
+void rdtscInit(int threadId); // registers the calling thread; thread 0 also registers every core bucket
+void rdtscStart(uint32_t bucketId); // bucketId is a CORE_BUCKETS value
+void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId); // count and drawId are ignored by the definition below
+void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2); // count2 is ignored by the definition below
+void rdtscEndFrame(); // advances the frame counter; starts/stops capture at the KNOB_BUCKETS_*_FRAME frames
+
+#ifdef KNOB_ENABLE_RDTSC // instrumentation on: each macro forwards to the matching rdtsc* function
+#define RDTSC_RESET() rdtscReset()
+#define RDTSC_INIT(threadId) rdtscInit(threadId)
+#define RDTSC_START(bucket) rdtscStart(bucket)
+#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw)
+#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2)
+#define RDTSC_ENDFRAME() rdtscEndFrame()
+#else // instrumentation off: every macro expands to nothing, so its arguments are never evaluated
+#define RDTSC_RESET()
+#define RDTSC_INIT(threadId)
+#define RDTSC_START(bucket)
+#define RDTSC_STOP(bucket, count, draw)
+#define RDTSC_EVENT(bucket, count1, count2)
+#define RDTSC_ENDFRAME()
+#endif
+
+extern std::vector<uint32_t> gBucketMap; // CORE_BUCKETS value -> registered bucket id
+extern BucketManager gBucketMgr;
+extern BUCKET_DESC gCoreBuckets[]; // bucket descriptions, indexed by CORE_BUCKETS value
+extern uint32_t gCurrentFrame; // frame counter maintained by rdtscEndFrame()/rdtscReset()
+
+INLINE void rdtscReset() // restart profiling state: frame counter back to 0, then drop registered threads and buckets
+{
+ gCurrentFrame = 0;
+ gBucketMgr.ClearThreads();
+ gBucketMgr.ClearBuckets(); // NOTE(review): gBucketMap is not cleared here - presumably rdtscInit(0) is expected to run again; confirm
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Register the calling thread with the bucket manager. Thread 0
+///        additionally registers every core bucket and records the ids.
+/// @param threadId - 0 is registered as "API"; any other value as "WORKER".
+INLINE void rdtscInit(int threadId)
+{
+    const bool isApiThread = (threadId == 0);
+
+    if (isApiThread)
+    {
+        // one-time bucket registration: slot i of the map holds the id the
+        // manager returned for gCoreBuckets[i]
+        gBucketMap.resize(NumBuckets);
+        uint32_t bucket = 0;
+        while (bucket < NumBuckets)
+        {
+            gBucketMap[bucket] = gBucketMgr.RegisterBucket(gCoreBuckets[bucket]);
+            ++bucket;
+        }
+    }
+
+    std::string name = isApiThread ? "API" : "WORKER";
+    gBucketMgr.RegisterThread(name);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Start timing a core bucket.
+/// @param bucketId - CORE_BUCKETS value; translated through gBucketMap into
+///                   the id registered by rdtscInit().
+INLINE void rdtscStart(uint32_t bucketId)
+{
+    const uint32_t registeredId = gBucketMap[bucketId];
+
+    gBucketMgr.StartBucket(registeredId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stop timing a core bucket.
+/// @param bucketId - CORE_BUCKETS value; translated through gBucketMap into
+///                   the id registered by rdtscInit().
+/// @param count - accepted for the RDTSC_STOP macro signature; not used here.
+/// @param drawId - accepted for the RDTSC_STOP macro signature; not used here.
+INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId)
+{
+    const uint32_t registeredId = gBucketMap[bucketId];
+
+    gBucketMgr.StopBucket(registeredId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Record an event against a core bucket.
+/// @param bucketId - CORE_BUCKETS value; translated through gBucketMap into
+///                   the id registered by rdtscInit().
+/// @param count1 - value forwarded to the bucket manager with the event.
+/// @param count2 - accepted for the RDTSC_EVENT macro signature; not used here.
+INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2)
+{
+    const uint32_t registeredId = gBucketMap[bucketId];
+
+    gBucketMgr.AddEvent(registeredId, count1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Advance the frame counter and drive the capture window: capture
+///        starts on frame KNOB_BUCKETS_START_FRAME; on frame
+///        KNOB_BUCKETS_END_FRAME it stops and the report is written to
+///        "rdtsc.txt".
+INLINE void rdtscEndFrame()
+{
+    ++gCurrentFrame;
+
+    if (KNOB_BUCKETS_START_FRAME == gCurrentFrame)
+    {
+        gBucketMgr.StartCapture();
+    }
+
+    // nothing more to do unless this is the last frame of the window
+    if (KNOB_BUCKETS_END_FRAME != gCurrentFrame)
+    {
+        return;
+    }
+
+    gBucketMgr.StopCapture();
+    gBucketMgr.PrintReport("rdtsc.txt");
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
new file mode 100644
index 00000000000..2758555fd4b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -0,0 +1,1027 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file state.h
+*
+* @brief Definitions for API state.
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "common/simdintrin.h"
+
+// clear flags: one bit each, so they can be OR'ed together; SWR_CLEAR_NONE is the empty set
+#define SWR_CLEAR_NONE 0
+#define SWR_CLEAR_COLOR (1 << 0)
+#define SWR_CLEAR_DEPTH (1 << 1)
+#define SWR_CLEAR_STENCIL (1 << 2)
+
+enum DRIVER_TYPE // NOTE(review): presumably selects which API's conventions the core follows - confirm at the use sites
+{
+ DX, // value 0
+ GL // value 1
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PRIMITIVE_TOPOLOGY.
+//////////////////////////////////////////////////////////////////////////
+enum PRIMITIVE_TOPOLOGY // explicit values; note the gaps at 0x15 and 0x18-0x1E
+{
+ TOP_UNKNOWN = 0x0,
+ TOP_POINT_LIST = 0x1,
+ TOP_LINE_LIST = 0x2,
+ TOP_LINE_STRIP = 0x3,
+ TOP_TRIANGLE_LIST = 0x4,
+ TOP_TRIANGLE_STRIP = 0x5,
+ TOP_TRIANGLE_FAN = 0x6,
+ TOP_QUAD_LIST = 0x7,
+ TOP_QUAD_STRIP = 0x8,
+ TOP_LINE_LIST_ADJ = 0x9,
+ TOP_LISTSTRIP_ADJ = 0xA, // NOTE(review): name looks like a typo for "LINESTRIP_ADJ" - confirm before renaming, other files may use it
+ TOP_TRI_LIST_ADJ = 0xB,
+ TOP_TRI_STRIP_ADJ = 0xC,
+ TOP_TRI_STRIP_REVERSE = 0xD,
+ TOP_POLYGON = 0xE,
+ TOP_RECT_LIST = 0xF,
+ TOP_LINE_LOOP = 0x10,
+ TOP_POINT_LIST_BF = 0x11,
+ TOP_LINE_STRIP_CONT = 0x12,
+ TOP_LINE_STRIP_BF = 0x13,
+ TOP_LINE_STRIP_CONT_BF = 0x14,
+ TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, // 0x15 is not assigned
+ TOP_TRIANGLE_DISC = 0x17, /// @todo What is this??
+
+ TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist: TOP_PATCHLIST_N == TOP_PATCHLIST_BASE + N
+ TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches
+ TOP_PATCHLIST_2 = 0x21,
+ TOP_PATCHLIST_3 = 0x22,
+ TOP_PATCHLIST_4 = 0x23,
+ TOP_PATCHLIST_5 = 0x24,
+ TOP_PATCHLIST_6 = 0x25,
+ TOP_PATCHLIST_7 = 0x26,
+ TOP_PATCHLIST_8 = 0x27,
+ TOP_PATCHLIST_9 = 0x28,
+ TOP_PATCHLIST_10 = 0x29,
+ TOP_PATCHLIST_11 = 0x2A,
+ TOP_PATCHLIST_12 = 0x2B,
+ TOP_PATCHLIST_13 = 0x2C,
+ TOP_PATCHLIST_14 = 0x2D,
+ TOP_PATCHLIST_15 = 0x2E,
+ TOP_PATCHLIST_16 = 0x2F,
+ TOP_PATCHLIST_17 = 0x30,
+ TOP_PATCHLIST_18 = 0x31,
+ TOP_PATCHLIST_19 = 0x32,
+ TOP_PATCHLIST_20 = 0x33,
+ TOP_PATCHLIST_21 = 0x34,
+ TOP_PATCHLIST_22 = 0x35,
+ TOP_PATCHLIST_23 = 0x36,
+ TOP_PATCHLIST_24 = 0x37,
+ TOP_PATCHLIST_25 = 0x38,
+ TOP_PATCHLIST_26 = 0x39,
+ TOP_PATCHLIST_27 = 0x3A,
+ TOP_PATCHLIST_28 = 0x3B,
+ TOP_PATCHLIST_29 = 0x3C,
+ TOP_PATCHLIST_30 = 0x3D,
+ TOP_PATCHLIST_31 = 0x3E,
+ TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_SHADER_TYPE
+//////////////////////////////////////////////////////////////////////////
+enum SWR_SHADER_TYPE // implicit values starting at 0
+{
+ SHADER_VERTEX,
+ SHADER_GEOMETRY,
+ SHADER_DOMAIN,
+ SHADER_HULL,
+ SHADER_PIXEL,
+ SHADER_COMPUTE,
+
+ NUM_SHADER_TYPES, // count of shader types above; keep it last
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_RENDERTARGET_ATTACHMENT
+/// @todo It's not clear what an "attachment" means. It's not a common term.
+//////////////////////////////////////////////////////////////////////////
+enum SWR_RENDERTARGET_ATTACHMENT // implicit values starting at 0; GetRenderHotTiles forms color attachments as SWR_ATTACHMENT_COLOR0 + slot
+{
+ SWR_ATTACHMENT_COLOR0, // the eight color attachments must stay contiguous for the COLOR0 + slot arithmetic
+ SWR_ATTACHMENT_COLOR1,
+ SWR_ATTACHMENT_COLOR2,
+ SWR_ATTACHMENT_COLOR3,
+ SWR_ATTACHMENT_COLOR4,
+ SWR_ATTACHMENT_COLOR5,
+ SWR_ATTACHMENT_COLOR6,
+ SWR_ATTACHMENT_COLOR7,
+ SWR_ATTACHMENT_DEPTH, // value 8
+ SWR_ATTACHMENT_STENCIL, // value 9
+
+ SWR_NUM_ATTACHMENTS // count of attachments above; keep it last
+};
+
+#define SWR_NUM_RENDERTARGETS 8 // equals the number of SWR_ATTACHMENT_COLORn enumerators
+
+#define SWR_ATTACHMENT_COLOR0_BIT 0x001 // each *_BIT equals 1 << the matching SWR_RENDERTARGET_ATTACHMENT value
+#define SWR_ATTACHMENT_COLOR1_BIT 0x002
+#define SWR_ATTACHMENT_COLOR2_BIT 0x004
+#define SWR_ATTACHMENT_COLOR3_BIT 0x008
+#define SWR_ATTACHMENT_COLOR4_BIT 0x010
+#define SWR_ATTACHMENT_COLOR5_BIT 0x020
+#define SWR_ATTACHMENT_COLOR6_BIT 0x040
+#define SWR_ATTACHMENT_COLOR7_BIT 0x080
+#define SWR_ATTACHMENT_DEPTH_BIT 0x100
+#define SWR_ATTACHMENT_STENCIL_BIT 0x200
+#define SWR_ATTACHMENT_MASK_ALL 0x3ff // all ten attachment bits
+#define SWR_ATTACHMENT_MASK_COLOR 0x0ff // the eight color attachment bits
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SWR Inner Tessellation factor ID
+/// See above GetTessFactorOutputPosition code for documentation
+enum SWR_INNER_TESSFACTOR_ID // indexes SWR_TESSELLATION_FACTORS::InnerTessFactors (sized by SWR_NUM_INNER_TESS_FACTORS)
+{
+ SWR_QUAD_U_TRI_INSIDE,
+ SWR_QUAD_V_INSIDE,
+
+ SWR_NUM_INNER_TESS_FACTORS, // count of inner factors; keep it last
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SWR Outer Tessellation factor ID
+/// See above GetTessFactorOutputPosition code for documentation
+enum SWR_OUTER_TESSFACTOR_ID // indexes SWR_TESSELLATION_FACTORS::OuterTessFactors (sized by SWR_NUM_OUTER_TESS_FACTORS)
+{
+ SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL,
+ SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY,
+ SWR_QUAD_U_EQ1_TRI_W,
+ SWR_QUAD_V_EQ1,
+
+ SWR_NUM_OUTER_TESS_FACTORS, // count of outer factors; keep it last
+};
+
+
+/////////////////////////////////////////////////////////////////////////
+/// simdvertex
+/// @brief Defines a vertex element that holds all the data for SIMD vertices.
+/// Contains position in clip space, hardcoded to attribute 0,
+/// space for up to 32 attributes, as well as any SGV values generated
+/// by the pipeline
+/////////////////////////////////////////////////////////////////////////
+#define VERTEX_POSITION_SLOT 0 // position always occupies attribute slot 0
+#define VERTEX_ATTRIB_START_SLOT 1 // first of the general-purpose attribute slots
+#define VERTEX_ATTRIB_END_SLOT 32 // last of the general-purpose attribute slots (1..32 inclusive gives 32 attributes)
+#define VERTEX_RTAI_SLOT 33 // GS writes RenderTargetArrayIndex here
+#define VERTEX_PRIMID_SLOT 34 // GS writes PrimId here
+#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS writes lower 4 clip/cull dist
+#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS writes upper 4 clip/cull dist
+#define VERTEX_POINT_SIZE_SLOT 37 // VS writes point size here
+static_assert(VERTEX_POINT_SIZE_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); // the highest slot must fit inside simdvertex::attrib
+
+// SoAoSoA: one simdvector per attribute slot, indexed by the VERTEX_*_SLOT defines
+struct simdvertex
+{
+ simdvector attrib[KNOB_NUM_ATTRIBUTES];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_VS_CONTEXT
+/// @brief Input to vertex shader
+/////////////////////////////////////////////////////////////////////////
+struct SWR_VS_CONTEXT
+{
+ simdvertex* pVin; // IN: SIMD input vertex data store
+ simdvertex* pVout; // OUT: SIMD output vertex data store
+
+ uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD
+ simdscalari VertexID; // IN: Vertex ID (one per SIMD lane)
+ simdscalari mask; // IN: Active mask for shader
+};
+
+/////////////////////////////////////////////////////////////////////////
+/// ScalarAttrib / ScalarCPoint
+/// @brief ScalarAttrib is a single four-component (x, y, z, w) attribute.
+/// ScalarCPoint defines a control point element as passed from the output
+/// of the hull shader to the input of the domain shader
+/////////////////////////////////////////////////////////////////////////
+struct ScalarAttrib // one scalar (non-SIMD) attribute: four float components
+{
+ float x;
+ float y;
+ float z;
+ float w;
+};
+
+struct ScalarCPoint // one control point: a ScalarAttrib for each of the KNOB_NUM_ATTRIBUTES attribute slots
+{
+ ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TESSELLATION_FACTORS
+/// @brief Tessellation factors structure (non-vector)
+/////////////////////////////////////////////////////////////////////////
+struct SWR_TESSELLATION_FACTORS
+{
+ float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; // indexed by SWR_OUTER_TESSFACTOR_ID
+ float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; // indexed by SWR_INNER_TESSFACTOR_ID
+};
+
+#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches
+struct ScalarPatch // one scalar patch: tessellation factors, up to MAX_NUM_VERTS_PER_PRIM control points, and one extra ScalarCPoint of patch data
+{
+ SWR_TESSELLATION_FACTORS tessFactors;
+ ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM];
+ ScalarCPoint patchData; // NOTE(review): presumably per-patch constant attributes - confirm against the hull/domain shader code
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_HS_CONTEXT
+/// @brief Input to hull shader
+/////////////////////////////////////////////////////////////////////////
+struct SWR_HS_CONTEXT
+{
+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data, one simdvertex per control point
+ simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call
+ simdscalari mask; // IN: Active mask for shader
+ ScalarPatch* pCPout; // OUT: Output control point patch
+ // SIMD-sized-array of SCALAR patches
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_DS_CONTEXT
+/// @brief Input to domain shader
+/////////////////////////////////////////////////////////////////////////
+struct SWR_DS_CONTEXT
+{
+ uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation
+ uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data.
+ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component
+ ScalarPatch* pCpIn; // IN: (SCALAR) Control patch
+ simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords
+ simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords
+ simdscalari mask; // IN: Active mask for shader
+ simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component; rows are vectorStride vectors apart)
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_GS_CONTEXT
+/// @brief Input to geometry shader.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_GS_CONTEXT
+{
+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims
+ simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call
+ uint32_t InstanceID; // IN: input instance ID (scalar: shared by all SIMD lanes)
+ simdscalari mask; // IN: Active mask for shader
+ uint8_t* pStream; // OUT: output stream (contains vertices for all output streams)
+ uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer
+ simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane
+};
+
+struct PixelPositions // one SIMD value per evaluation position; used by SWR_PS_CONTEXT for x/y locations, barycentrics and 1/w
+{
+ simdscalar UL; // NOTE(review): presumably the pixel's upper-left corner - confirm
+ simdscalar center; // pixel center
+ simdscalar sample; // sample position
+ simdscalar centroid; // centroid
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_PS_CONTEXT
+/// @brief Input to pixel shader.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_PS_CONTEXT
+{
+ PixelPositions vX; // IN: x location(s) of pixels
+ PixelPositions vY; // IN: y location(s) of pixels
+ simdscalar vZ; // INOUT: z location of pixels
+ simdscalari activeMask; // OUT: mask for kill
+ simdscalar inputMask; // IN: input coverage mask for all samples
+ simdscalari oMask; // OUT: mask for output coverage
+
+ PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid
+ PixelPositions vJ; // second barycentric coordinate, evaluated at the same positions as vI
+ PixelPositions vOneOverW; // IN: 1/w
+
+ const float* pAttribs; // IN: pointer to attribute barycentric coefficients
+ const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients
+ const float* pRecipW; // IN: pointer to 1/w coord for each vertex
+ const float *I; // IN: Barycentric A, B, and C coefs used to compute I
+ const float *J; // IN: Barycentric A, B, and C coefs used to compute J
+ float recipDet; // IN: 1/Det, used when barycentric interpolating attributes
+ const float* pSamplePosX; // IN: array of sample x positions
+ const float* pSamplePosY; // IN: array of sample y positions
+ simdvector shaded[SWR_NUM_RENDERTARGETS];
+ // OUT: result color per rendertarget
+
+ uint32_t frontFace; // IN: front- 1, back- 0
+ uint32_t primID; // IN: primitive ID
+ uint32_t sampleIndex; // IN: sampleIndex
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_CS_CONTEXT
+/// @brief Input to compute shader.
+/// @note A pointer to this struct is the second argument of PFN_CS_FUNC.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_CS_CONTEXT
+{
+ // The ThreadGroupId is the current thread group index relative
+ // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup,
+ // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader.
+
+ // Compute shader accepts the following system values.
+ // o ThreadId - Current thread id relative to all other threads in dispatch.
+ // o ThreadGroupId - Current thread group id relative to all other groups in dispatch.
+ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group.
+ // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup.
+ //
+ // All of these system values can be computed in the shader. They will be
+ // derived from the current tile counter. The tile counter is an atomic counter that
+ // resides in the draw context and is initialized to the product of the dispatch dims.
+ //
+ // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z
+ //
+ // Each CPU worker thread will atomically decrement this counter and pass the current
+ // count into the shader. When the count reaches 0 then all thread groups in the
+ // dispatch call have been completed.
+
+ uint32_t tileCounter; // The tile counter value for this thread group.
+
+ // Dispatch dimensions used by shader to compute system values from the tile counter.
+ // Three entries; the formula above names them x, y, z (presumably indices 0, 1, 2 - confirm).
+ uint32_t dispatchDims[3];
+
+ uint8_t* pTGSM; // Thread Group Shared Memory pointer.
+
+ uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support
+};
+
+// enums
+// Surface memory layouts; this is the type of SWR_SURFACE_STATE::tileMode.
+// Only the first enumerator has an explicit value, so the values run 0..4
+// and SWR_TILE_MODE_COUNT evaluates to 5 (the number of modes).
+enum SWR_TILE_MODE
+{
+ SWR_TILE_NONE = 0x0, // Linear mode (no tiling)
+ SWR_TILE_MODE_WMAJOR, // W major tiling
+ SWR_TILE_MODE_XMAJOR, // X major tiling
+ SWR_TILE_MODE_YMAJOR, // Y major tiling
+ SWR_TILE_SWRZ, // SWR-Z tiling
+
+ SWR_TILE_MODE_COUNT
+};
+
+// Kind of resource a surface describes; this is the type of
+// SWR_SURFACE_STATE::type. Every value is explicit: 0..5 are assigned,
+// 6 is unused and SURFACE_NULL is 7.
+// NOTE(review): the explicit numbering presumably mirrors an external
+// encoding - confirm before renumbering.
+enum SWR_SURFACE_TYPE
+{
+ SURFACE_1D = 0,
+ SURFACE_2D = 1,
+ SURFACE_3D = 2,
+ SURFACE_CUBE = 3,
+ SURFACE_BUFFER = 4,
+ SURFACE_STRUCTURED_BUFFER = 5,
+ SURFACE_NULL = 7
+};
+
+// Comparison functions, implicitly numbered 0..7; NUM_ZFUNC evaluates to 8,
+// the number of functions. The eight values fit in the 3-bit depthTestFunc /
+// stencilTestFunc / backfaceStencilTestFunc fields of SWR_DEPTH_STENCIL_STATE,
+// which presumably store this enum - confirm.
+enum SWR_ZFUNCTION
+{
+ ZFUNC_ALWAYS,
+ ZFUNC_NEVER,
+ ZFUNC_LT,
+ ZFUNC_EQ,
+ ZFUNC_LE,
+ ZFUNC_GT,
+ ZFUNC_NE,
+ ZFUNC_GE,
+ NUM_ZFUNC
+};
+
+// Stencil update operations, implicitly numbered 0..7 (STENCILOP_KEEP == 0).
+// The eight values fit in the 3-bit *StencilFailOp / *PassDepthFailOp /
+// *PassDepthPassOp fields of SWR_DEPTH_STENCIL_STATE, which presumably store
+// this enum - confirm.
+enum SWR_STENCILOP
+{
+ STENCILOP_KEEP,
+ STENCILOP_ZERO,
+ STENCILOP_REPLACE,
+ STENCILOP_INCRSAT,
+ STENCILOP_DECRSAT,
+ STENCILOP_INCR,
+ STENCILOP_DECR,
+ STENCILOP_INVERT
+};
+
+// Blend factor selectors, implicitly numbered 0..18 (19 enumerators).
+// Note that BLENDFACTOR_ONE is 0 and BLENDFACTOR_ZERO is 10, so a
+// zero-initialized factor selects ONE, not ZERO.
+enum SWR_BLEND_FACTOR
+{
+ BLENDFACTOR_ONE,
+ BLENDFACTOR_SRC_COLOR,
+ BLENDFACTOR_SRC_ALPHA,
+ BLENDFACTOR_DST_ALPHA,
+ BLENDFACTOR_DST_COLOR,
+ BLENDFACTOR_SRC_ALPHA_SATURATE,
+ BLENDFACTOR_CONST_COLOR,
+ BLENDFACTOR_CONST_ALPHA,
+ BLENDFACTOR_SRC1_COLOR,
+ BLENDFACTOR_SRC1_ALPHA,
+ BLENDFACTOR_ZERO,
+ BLENDFACTOR_INV_SRC_COLOR,
+ BLENDFACTOR_INV_SRC_ALPHA,
+ BLENDFACTOR_INV_DST_ALPHA,
+ BLENDFACTOR_INV_DST_COLOR,
+ BLENDFACTOR_INV_CONST_COLOR,
+ BLENDFACTOR_INV_CONST_ALPHA,
+ BLENDFACTOR_INV_SRC1_COLOR,
+ BLENDFACTOR_INV_SRC1_ALPHA
+};
+
+// Blend equations, implicitly numbered 0..4 (BLENDOP_ADD == 0).
+enum SWR_BLEND_OP
+{
+ BLENDOP_ADD,
+ BLENDOP_SUBTRACT,
+ BLENDOP_REVSUBTRACT,
+ BLENDOP_MIN,
+ BLENDOP_MAX,
+};
+
+// Framebuffer logic operations, implicitly numbered 0..15.
+// The implicit values coincide with each operation's 4-bit truth table when
+// the source is taken as 0b1100 and the destination as 0b1010, e.g.
+// AND = 0b1000 (8), XOR = 0b0110 (6), OR = 0b1110 (14), COPY = 0b1100 (12),
+// NOOP = 0b1010 (10).
+// NOTE(review): this ordering is presumably intentional - confirm before
+// inserting or reordering enumerators.
+enum SWR_LOGIC_OP
+{
+ LOGICOP_CLEAR,
+ LOGICOP_NOR,
+ LOGICOP_AND_INVERTED,
+ LOGICOP_COPY_INVERTED,
+ LOGICOP_AND_REVERSE,
+ LOGICOP_INVERT,
+ LOGICOP_XOR,
+ LOGICOP_NAND,
+ LOGICOP_AND,
+ LOGICOP_EQUIV,
+ LOGICOP_NOOP,
+ LOGICOP_OR_INVERTED,
+ LOGICOP_COPY,
+ LOGICOP_OR_REVERSE,
+ LOGICOP_OR,
+ LOGICOP_SET,
+};
+
+// Description of one surface: base address, format, dimensions, sampling
+// limits and layout. The per-field comments say which fields apply to
+// sampled surfaces and which to render targets.
+// NOTE(review): the "@llvm_enum" tags look like markers consumed by a code
+// generator - confirm before changing member types, order or comment tags.
+struct SWR_SURFACE_STATE
+{
+ uint8_t *pBaseAddress;
+ SWR_SURFACE_TYPE type; // @llvm_enum
+ SWR_FORMAT format; // @llvm_enum
+ uint32_t width;
+ uint32_t height;
+ uint32_t depth;
+ uint32_t numSamples;
+ uint32_t samplePattern;
+ uint32_t pitch;
+ uint32_t qpitch;
+ uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler
+ uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed
+ float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler
+ uint32_t lod; // for render targets, the lod being rendered to
+ uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces
+ SWR_TILE_MODE tileMode; // @llvm_enum
+ bool bInterleavedSamples; // are MSAA samples stored interleaved or planar
+ uint32_t halign;
+ uint32_t valign;
+ uint32_t xOffset;
+ uint32_t yOffset;
+
+ uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces
+
+ uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc.
+};
+
+// vertex fetch state
+// Describes one bound vertex buffer; SWR_FETCH_CONTEXT::pStreams points at
+// an array of these.
+// WARNING- any changes to this struct need to be reflected
+// in the fetch shader jit
+struct SWR_VERTEX_BUFFER_STATE
+{
+ uint32_t index;
+ uint32_t pitch;
+ const uint8_t *pData;
+ uint32_t size;
+ uint32_t numaNode;
+ uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks
+ uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for partially OOB vertices
+};
+
+// Describes the bound index buffer: element format, data pointer and size.
+struct SWR_INDEX_BUFFER_STATE
+{
+ // Format type for indices (e.g. UINT16, UINT32, etc.)
+ SWR_FORMAT format; // @llvm_enum
+ const void *pIndices;
+ uint32_t size; // NOTE(review): units (bytes vs. indices) are not stated here - confirm
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_FETCH_CONTEXT
+/// @brief Input to fetch shader.
+/// @note WARNING - Changes to this struct need to be reflected in the
+/// fetch shader jit.
+/// @note Passed by reference as the first argument of PFN_FETCH_FUNC.
+///       Members tagged IN are inputs; members tagged OUT are results.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_FETCH_CONTEXT
+{
+ const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers
+ const int32_t* pIndices; // IN: pointer to index buffer for indexed draws
+ const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking
+ uint32_t CurInstance; // IN: current instance
+ uint32_t BaseVertex; // IN: base vertex
+ uint32_t StartVertex; // IN: start vertex
+ uint32_t StartInstance; // IN: start instance
+ simdscalari VertexID; // OUT: vector of vertex IDs
+ simdscalari CutMask; // OUT: vector mask of indices which have the cut index value
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_STATS
+///
+/// @brief All statistics generated by SWR go here. These are public
+/// to driver.
+/// @note The streamout arrays have 4 entries each, which matches the
+///       value of MAX_SO_STREAMS (presumably one entry per stream -
+///       confirm; the literal 4 is not tied to the macro).
+/////////////////////////////////////////////////////////////////////////
+struct SWR_STATS
+{
+ // Occlusion Query
+ uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
+
+ // Pipeline Stats
+ uint64_t IaVertices; // Number of Fetch Shader vertices
+ uint64_t IaPrimitives; // Number of PA primitives.
+ uint64_t VsInvocations; // Number of Vertex Shader invocations
+ uint64_t HsInvocations; // Number of Hull Shader invocations
+ uint64_t DsInvocations; // Number of Domain Shader invocations
+ uint64_t GsInvocations; // Number of Geometry Shader invocations
+ uint64_t PsInvocations; // Number of Pixel Shader invocations
+ uint64_t CsInvocations; // Number of Compute Shader invocations
+ uint64_t CInvocations; // Number of clipper invocations
+ uint64_t CPrimitives; // Number of clipper primitives.
+ uint64_t GsPrimitives; // Number of prims GS outputs.
+
+ // Streamout Stats
+ uint32_t SoWriteOffset[4];
+ uint64_t SoPrimStorageNeeded[4];
+ uint64_t SoNumPrimsWritten[4];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_BUFFERS
+/////////////////////////////////////////////////////////////////////////
+
+// Number of stream-output streams; sizes the per-stream arrays in
+// SWR_STREAMOUT_STATE and SWR_STREAMOUT_CONTEXT.
+#define MAX_SO_STREAMS 4
+// NOTE(review): 32 equals the bit width of the uint32_t stream masks in
+// SWR_STREAMOUT_STATE, presumably one mask bit per attribute - confirm.
+#define MAX_ATTRIBUTES 32
+
+// One stream-output destination buffer. Sizes, pitch and offsets are all
+// expressed in dwords, as stated on each field.
+struct SWR_STREAMOUT_BUFFER
+{
+ bool enable;
+
+ // Pointer to the streamout buffer.
+ uint32_t* pBuffer;
+
+ // Size of buffer in dwords.
+ uint32_t bufferSize;
+
+ // Vertex pitch of buffer in dwords.
+ uint32_t pitch;
+
+ // Offset into buffer in dwords. SOS will increment this offset.
+ uint32_t streamOffset;
+
+ // Offset to the SO write offset. If not null then we update offset here.
+ uint32_t* pWriteOffset;
+
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_STATE
+/// Per-draw stream output configuration. The per-stream arrays are
+/// sized by MAX_SO_STREAMS.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_STREAMOUT_STATE
+{
+ // This disables stream output.
+ bool soEnable;
+
+ // which streams are enabled for streamout
+ bool streamEnable[MAX_SO_STREAMS];
+
+ // If set then do not send any streams to the rasterizer.
+ bool rasterizerDisable;
+
+ // Specifies which stream to send to the rasterizer.
+ uint32_t streamToRasterizer;
+
+ // The stream masks specify which attributes are sent to which streams.
+ // These masks help the FE to setup the pPrimData buffer that is passed
+ // to the Stream Output Shader (SOS) function.
+ uint32_t streamMasks[MAX_SO_STREAMS];
+
+ // Number of attributes, including position, per vertex that are streamed out.
+ // This should match number of bits in stream mask.
+ uint32_t streamNumEntries[MAX_SO_STREAMS];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_CONTEXT - Passed to SOS
+/// Passed by reference as the only argument of PFN_SO_FUNC.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_STREAMOUT_CONTEXT
+{
+ // Primitive data to stream out (see the streamMasks note in SWR_STREAMOUT_STATE).
+ uint32_t* pPrimData;
+ // MAX_SO_STREAMS buffer pointers.
+ SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS];
+
+ // Num prims written for this stream
+ uint32_t numPrimsWritten;
+
+ // Num prims that should have been written if there were no overflow.
+ uint32_t numPrimStorageNeeded;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_GS_STATE - Geometry shader state
+/// Static configuration of the geometry shader stage (the per-invocation
+/// data lives in SWR_GS_CONTEXT).
+/////////////////////////////////////////////////////////////////////////
+struct SWR_GS_STATE
+{
+ // true when the geometry shader stage is enabled
+ bool gsEnable;
+
+ // number of input attributes per vertex. used by the frontend to
+ // optimize assembling primitives for GS
+ uint32_t numInputAttribs;
+
+ // output topology - can be point, tristrip, or linestrip
+ PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum
+
+ // maximum number of verts that can be emitted by a single instance of the GS
+ uint32_t maxNumVerts;
+
+ // instance count
+ uint32_t instanceCount;
+
+ // geometry shader emits renderTargetArrayIndex
+ bool emitsRenderTargetArrayIndex;
+
+ // geometry shader emits PrimitiveID
+ bool emitsPrimitiveID;
+
+ // if true, geometry shader emits a single stream, with separate cut buffer.
+ // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
+ // to map vertices to streams
+ bool isSingleStream;
+
+ // when single stream is enabled, singleStreamID dictates which stream is being output.
+ // field ignored if isSingleStream is false
+ uint32_t singleStreamID;
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS
+/// Implicitly numbered 0..3; SWR_TS_OUTPUT_TOPOLOGY_COUNT evaluates to 4.
+/////////////////////////////////////////////////////////////////////////
+enum SWR_TS_OUTPUT_TOPOLOGY
+{
+ SWR_TS_OUTPUT_POINT,
+ SWR_TS_OUTPUT_LINE,
+ SWR_TS_OUTPUT_TRI_CW,
+ SWR_TS_OUTPUT_TRI_CCW,
+
+ SWR_TS_OUTPUT_TOPOLOGY_COUNT
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TS_PARTITIONING - Defines tessellation algorithm
+/// Implicitly numbered 0..2; SWR_TS_PARTITIONING_COUNT evaluates to 3.
+/////////////////////////////////////////////////////////////////////////
+enum SWR_TS_PARTITIONING
+{
+ SWR_TS_INTEGER,
+ SWR_TS_ODD_FRACTIONAL,
+ SWR_TS_EVEN_FRACTIONAL,
+
+ SWR_TS_PARTITIONING_COUNT
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TS_DOMAIN - Defines Tessellation Domain
+/// Implicitly numbered 0..2; SWR_TS_DOMAIN_COUNT evaluates to 3.
+/////////////////////////////////////////////////////////////////////////
+enum SWR_TS_DOMAIN
+{
+ SWR_TS_QUAD,
+ SWR_TS_TRI,
+ SWR_TS_ISOLINE,
+
+ SWR_TS_DOMAIN_COUNT
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_TS_STATE - Tessellation state
+/// Enable flag, tessellator configuration enums and per-stage attribute
+/// counts.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_TS_STATE
+{
+ bool tsEnable; // true when tessellation is enabled
+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum
+ SWR_TS_PARTITIONING partitioning; // @llvm_enum
+ SWR_TS_DOMAIN domain; // @llvm_enum
+
+ PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum
+
+ // attribute counts; presumably per vertex / control point - confirm
+ uint32_t numHsInputAttribs;
+ uint32_t numHsOutputAttribs;
+ uint32_t numDsOutputAttribs;
+};
+
+// output merger state
+// Per-render-target channel write-disable bits. Four 1-bit fields packed
+// into a single byte; the static_assert below locks the size at 1.
+// A set bit disables writes to that channel.
+struct SWR_RENDER_TARGET_BLEND_STATE
+{
+ uint8_t writeDisableRed : 1;
+ uint8_t writeDisableGreen : 1;
+ uint8_t writeDisableBlue : 1;
+ uint8_t writeDisableAlpha : 1;
+};
+static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
+
+// Largest supported sample count; sizes SWR_RASTSTATE::iSamplePos.
+#define SWR_MAX_NUM_MULTISAMPLES 16
+// Sample-count selector. The enumerator values are 0..4, i.e. log2 of the
+// sample count in the name (1X = 0, 2X = 1, 4X = 2, 8X = 3, 16X = 4), NOT
+// the sample count itself. SWR_MULTISAMPLE_TYPE_MAX evaluates to 5.
+enum SWR_MULTISAMPLE_COUNT
+{
+ SWR_MULTISAMPLE_1X = 0,
+ SWR_MULTISAMPLE_2X,
+ SWR_MULTISAMPLE_4X,
+ SWR_MULTISAMPLE_8X,
+ SWR_MULTISAMPLE_16X,
+ SWR_MULTISAMPLE_TYPE_MAX
+};
+
+// Blend state shared by all render targets plus the per-target write masks.
+// A const pointer to this struct is the first argument of PFN_BLEND_JIT_FUNC.
+// Size is locked at 36 bytes by the static_assert below:
+// 16 (constantColor) + 4 + 4 + 4 (sampleCount, assuming a 4-byte enum)
+// + 1 byte per render target, which implies SWR_NUM_RENDERTARGETS == 8.
+struct SWR_BLEND_STATE
+{
+ // constant blend factor color in RGBA float
+ float constantColor[4];
+
+ // alpha test reference value in unorm8 or float32
+ uint32_t alphaTestReference;
+ uint32_t sampleMask;
+ // all RT's have the same sample count
+ ///@todo move this to Output Merger state when we refactor
+ SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum
+
+ SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS];
+};
+static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size");
+
+//////////////////////////////////////////////////////////////////////////
+/// FUNCTION POINTERS FOR SHADERS
+/// All use the __cdecl calling convention and return void.
+/// The vertex/hull/domain/geometry/compute/pixel entry points take an
+/// opaque HANDLE plus a pointer to the stage's SWR_*_CONTEXT; the fetch
+/// and stream-output entry points take their context by reference.
+
+typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
+typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext);
+typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext);
+typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext);
+typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext);
+typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
+typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
+typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+// NOTE(review): the eight blend parameters are unnamed here; their roles
+// must be taken from the blend JIT / its callers.
+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+
+//////////////////////////////////////////////////////////////////////////
+/// FRONTEND_STATE
+/// Front-end configuration: viewport-transform bypass and provoking
+/// vertex selection.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_FRONTEND_STATE
+{
+ // skip clip test, perspective divide, and viewport transform
+ // intended for verts in screen space
+ bool vpTransformDisable;
+ // Provoking-vertex selectors packed as bitfields (2 + 1 + 2 = 5 bits);
+ // "bits" aliases the same storage so all selectors can be read or
+ // written as one 32-bit value.
+ union
+ {
+ struct
+ {
+ uint32_t triFan : 2;
+ uint32_t lineStripList : 1;
+ uint32_t triStripList : 2;
+ };
+ uint32_t bits;
+ }provokingVertex;
+ uint32_t topologyProvokingVertex; // provoking vertex for the draw topology
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// VIEWPORT_MATRIX
+/// Six coefficients named by (row, column) index.
+/// NOTE(review): presumably m00/m11/m22 are the x/y/z scale terms and
+/// m30/m31/m32 the x/y/z translation terms of the viewport transform -
+/// confirm against the code that fills and applies this struct.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_VIEWPORT_MATRIX
+{
+ float m00;
+ float m11;
+ float m22;
+ float m30;
+ float m31;
+ float m32;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_VIEWPORT
+/// Viewport rectangle (x, y, width, height) and depth range (minZ, maxZ),
+/// all stored as floats.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_VIEWPORT
+{
+ float x;
+ float y;
+ float width;
+ float height;
+ float minZ;
+ float maxZ;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_CULLMODE
+/// Implicitly numbered 0..3; note that 0 is SWR_CULLMODE_BOTH, not NONE.
+/// The four values fit the 2-bit SWR_RASTSTATE::cullMode field, which
+/// presumably stores this enum - confirm.
+//////////////////////////////////////////////////////////////////////////
+enum SWR_CULLMODE
+{
+ SWR_CULLMODE_BOTH,
+ SWR_CULLMODE_NONE,
+ SWR_CULLMODE_FRONT,
+ SWR_CULLMODE_BACK
+};
+
+// Polygon fill modes, implicitly numbered 0..2. The values fit the 2-bit
+// SWR_RASTSTATE::fillMode field, which presumably stores this enum - confirm.
+enum SWR_FILLMODE
+{
+ SWR_FILLMODE_POINT,
+ SWR_FILLMODE_WIREFRAME,
+ SWR_FILLMODE_SOLID
+};
+
+// Winding order that counts as front-facing: CW = 0, CCW = 1. The values
+// fit the 1-bit SWR_RASTSTATE::frontWinding field, which presumably stores
+// this enum - confirm.
+enum SWR_FRONTWINDING
+{
+ SWR_FRONTWINDING_CW,
+ SWR_FRONTWINDING_CCW
+};
+
+
+// MSAA sample placement pattern; this is the type of
+// SWR_RASTSTATE::samplePattern. Implicitly numbered 0..1;
+// SWR_MSAA_SAMPLE_PATTERN_MAX evaluates to 2 (the number of patterns).
+enum SWR_MSAA_SAMPLE_PATTERN
+{
+ SWR_MSAA_CENTER_PATTERN,
+ SWR_MSAA_STANDARD_PATTERN,
+ SWR_MSAA_SAMPLE_PATTERN_MAX
+};
+
+// Pixel reference location: CENTER = 0, UL = 1. SWR_RASTSTATE::pixelLocation
+// is commented "UL or Center" and presumably holds one of these - confirm.
+enum SWR_PIXEL_LOCATION
+{
+ SWR_PIXEL_LOCATION_CENTER,
+ SWR_PIXEL_LOCATION_UL,
+};
+
+// fixed point screen space sample locations within a pixel
+// SWR_RASTSTATE::iSamplePos holds SWR_MAX_NUM_MULTISAMPLES of these.
+// NOTE(review): the fixed-point format (number of fractional bits) is not
+// stated here - confirm against the rasterizer's fixed-point constants.
+struct SWR_MULTISAMPLE_POS
+{
+ uint32_t x;
+ uint32_t y;
+};
+
+// MSAA rasterization mode; this is the type of SWR_RASTSTATE::rastMode.
+// Implicitly numbered 0..3: {OFF, ON} x {PIXEL, PATTERN}.
+enum SWR_MSAA_RASTMODE
+{
+ SWR_MSAA_RASTMODE_OFF_PIXEL,
+ SWR_MSAA_RASTMODE_OFF_PATTERN,
+ SWR_MSAA_RASTMODE_ON_PIXEL,
+ SWR_MSAA_RASTMODE_ON_PATTERN
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_RASTSTATE
+/// Rasterizer state: cull/fill/winding selectors, point and line sizes,
+/// depth bias, multisample configuration and user clip/cull enables.
+//////////////////////////////////////////////////////////////////////////
+struct SWR_RASTSTATE
+{
+ // 7 bits of packed selectors / enables. The 2-bit and 1-bit widths match
+ // the value ranges of SWR_CULLMODE, SWR_FILLMODE and SWR_FRONTWINDING.
+ uint32_t cullMode : 2;
+ uint32_t fillMode : 2;
+ uint32_t frontWinding : 1;
+ uint32_t scissorEnable : 1;
+ uint32_t depthClipEnable : 1;
+ float pointSize;
+ float lineWidth;
+
+ // point size output from the VS
+ bool pointParam;
+
+ // point sprite
+ bool pointSpriteEnable;
+ bool pointSpriteTopOrigin;
+
+ // depth bias
+ float depthBias;
+ float slopeScaledDepthBias;
+ float depthBiasClamp;
+ SWR_FORMAT depthFormat; // @llvm_enum
+
+ ///@todo: MSAA lines
+ // multisample state for MSAA lines
+ bool msaaRastEnable;
+ SWR_MSAA_RASTMODE rastMode; // @llvm_enum
+
+ // sample count the rasterizer is running at
+ SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum
+ bool bForcedSampleCount;
+ uint32_t pixelLocation; // UL or Center
+ bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction
+ SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES];
+ SWR_MSAA_SAMPLE_PATTERN samplePattern; // @llvm_enum
+
+ // user clip/cull distance enables
+ // (8-bit masks; presumably one bit per user distance - confirm)
+ uint8_t cullDistanceMask;
+ uint8_t clipDistanceMask;
+};
+
+// backend state
+// Attribute configuration for the back end. The two masks are 32 bits wide
+// (presumably one bit per attribute - confirm); numComponents has one entry
+// per attribute slot, KNOB_NUM_ATTRIBUTES in total.
+struct SWR_BACKEND_STATE
+{
+ uint32_t constantInterpolationMask;
+ uint32_t pointSpriteTexCoordMask;
+ uint8_t numAttributes;
+ uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
+};
+
+// Depth/stencil state packed into three dwords. The anonymous struct gives
+// named access; value[3] aliases the same 12 bytes for whole-state access.
+//  - dword 0: 5 one-bit enables + 9 three-bit fields = 32 bits.
+//  - dword 1: four 8-bit stencil masks.
+//  - dword 2: two 8-bit reference values; the remaining 2 bytes of value[2]
+//    have no named field.
+// The 3-bit fields are wide enough for the 8 values of SWR_ZFUNCTION /
+// SWR_STENCILOP (presumably what they store - confirm).
+union SWR_DEPTH_STENCIL_STATE
+{
+ struct
+ {
+ // dword 0
+ uint32_t depthWriteEnable : 1;
+ uint32_t depthTestEnable : 1;
+ uint32_t stencilWriteEnable : 1;
+ uint32_t stencilTestEnable : 1;
+ uint32_t doubleSidedStencilTestEnable : 1;
+
+ uint32_t depthTestFunc : 3;
+ uint32_t stencilTestFunc : 3;
+
+ uint32_t backfaceStencilPassDepthPassOp : 3;
+ uint32_t backfaceStencilPassDepthFailOp : 3;
+ uint32_t backfaceStencilFailOp : 3;
+ uint32_t backfaceStencilTestFunc : 3;
+ uint32_t stencilPassDepthPassOp : 3;
+ uint32_t stencilPassDepthFailOp : 3;
+ uint32_t stencilFailOp : 3;
+
+ // dword 1
+ uint8_t backfaceStencilWriteMask;
+ uint8_t backfaceStencilTestMask;
+ uint8_t stencilWriteMask;
+ uint8_t stencilTestMask;
+
+ // dword 2
+ uint8_t backfaceStencilRefValue;
+ uint8_t stencilRefValue;
+ };
+ uint32_t value[3];
+};
+
+// Pixel shader invocation rate, implicitly numbered 0..2;
+// SWR_SHADING_RATE_MAX evaluates to 3. The values fit the 2-bit
+// SWR_PS_STATE::shadingRate field ("per pixel / sample / coarse pixel").
+enum SWR_SHADING_RATE
+{
+ SWR_SHADING_RATE_PIXEL,
+ SWR_SHADING_RATE_SAMPLE,
+ SWR_SHADING_RATE_COARSE,
+ SWR_SHADING_RATE_MAX,
+};
+
+// Type of input coverage a pixel shader uses: NONE = 0, NORMAL = 1;
+// SWR_INPUT_COVERAGE_MAX evaluates to 2. The two real values fit the 1-bit
+// SWR_PS_STATE::inputCoverage field.
+enum SWR_INPUT_COVERAGE
+{
+ SWR_INPUT_COVERAGE_NONE,
+ SWR_INPUT_COVERAGE_NORMAL,
+ SWR_INPUT_COVERAGE_MAX,
+};
+
+// Offset type added to the pixel position, implicitly numbered 0..2;
+// SWR_PS_POSITION_OFFSET_MAX evaluates to 3. The values fit the 2-bit
+// SWR_PS_STATE::posOffset field ("none, sample, centroid").
+enum SWR_PS_POSITION_OFFSET
+{
+ SWR_PS_POSITION_SAMPLE_NONE,
+ SWR_PS_POSITION_SAMPLE_OFFSET,
+ SWR_PS_POSITION_CENTROID_OFFSET,
+ SWR_PS_POSITION_OFFSET_MAX,
+};
+
+// Bit flags (not sequential values) naming the barycentric evaluation modes;
+// they can be OR-ed together. The three flags occupy bits 0..2, matching the
+// 3-bit SWR_PS_STATE::barycentricsMask field. SWR_BARYCENTRICS_MASK_MAX
+// (0x8) is one past the largest valid combination (0x7).
+enum SWR_BARYCENTRICS_MASK
+{
+ SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1,
+ SWR_BARYCENTRIC_CENTROID_MASK = 0x2,
+ SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4,
+ SWR_BARYCENTRICS_MASK_MAX = 0x8
+};
+
+// pixel shader state
+// Shader entry point plus 17 bits of packed flags describing what the
+// shader does. The "dword 0-1" label assumes a 64-bit function pointer.
+struct SWR_PS_STATE
+{
+ // dword 0-1
+ PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn
+
+ // dword 2
+ uint32_t killsPixel : 1; // pixel shader can kill pixels
+ uint32_t inputCoverage : 1; // type of input coverage PS uses
+ uint32_t writesODepth : 1; // pixel shader writes to depth
+ uint32_t usesSourceDepth : 1; // pixel shader reads depth
+ uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel
+ uint32_t numRenderTargets : 4; // number of render target outputs in use (0-8)
+ uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position
+ uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with
+ uint32_t usesUAV : 1; // pixel shader accesses UAV
+ uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h
new file mode 100644
index 00000000000..915ac77897b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h
@@ -0,0 +1,88 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tessellator.h
+*
+* @brief Tessellator fixed function unit interface definition
+*
+******************************************************************************/
+#pragma once
+
+/// Allocate and initialize a new tessellation context
+/// @return Handle to the new context.
+/// @note The definition later in this header is a not-implemented stub
+///       that asserts and returns NULL.
+HANDLE SWR_API TSInitCtx(
+ SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle)
+ SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm
+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology
+ void* pContextMem, ///< [IN] Memory to use for the context
+ size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required
+
+/// Destroy & de-allocate tessellation context
+/// @note The definition later in this header is a not-implemented stub
+///       that only asserts.
+void SWR_API TSDestroyCtx(
+ HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed
+
+/// Output of TSTessellate: primitive / domain-point counts plus pointers to
+/// the generated index and domain-point arrays.
+struct SWR_TS_TESSELLATED_DATA
+{
+ uint32_t NumPrimitives;
+ uint32_t NumDomainPoints;
+
+ // Three index arrays; presumably one per primitive vertex (up to 3 for
+ // triangles) - confirm.
+ uint32_t* ppIndices[3];
+ float* pDomainPointsU;
+ float* pDomainPointsV;
+ // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i]
+ // (W is not stored; it is derived from U and V as shown.)
+};
+
+/// Perform Tessellation
+/// @note The definition later in this header is a not-implemented stub
+///       that only asserts and leaves tsTessellatedData untouched.
+void SWR_API TSTessellate(
+ HANDLE tsCtx, ///< [IN] Tessellation Context
+ const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors
+ SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data
+
+
+
+/// @TODO - Implement OSS tessellator
+
+/// Placeholder implementation: fires SWR_ASSERT with a "Not Implemented"
+/// message and returns NULL. None of the arguments are read, and memSize
+/// is NOT updated despite being documented as [INOUT] on the prototype.
+INLINE HANDLE SWR_API TSInitCtx(
+ SWR_TS_DOMAIN tsDomain,
+ SWR_TS_PARTITIONING tsPartitioning,
+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology,
+ void* pContextMem,
+ size_t& memSize)
+{
+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
+ return NULL;
+}
+
+
+/// Placeholder implementation: fires SWR_ASSERT with a "Not Implemented"
+/// message; tsCtx is not read and nothing is freed.
+INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx)
+{
+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
+}
+
+
+/// Placeholder implementation: fires SWR_ASSERT with a "Not Implemented"
+/// message. tsTessellatedData is left unmodified, so callers must not read
+/// it after this call.
+INLINE void SWR_API TSTessellate(
+ HANDLE tsCtx,
+ const SWR_TESSELLATION_FACTORS& tsTessFactors,
+ SWR_TS_TESSELLATED_DATA& tsTessellatedData)
+{
+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
new file mode 100644
index 00000000000..24c5588bfec
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -0,0 +1,962 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#include <stdio.h>
+#include <thread>
+#include <algorithm>
+#include <unordered_set>
+#include <float.h>
+#include <vector>
+#include <utility>
+#include <fstream>
+#include <string>
+
+#if defined(__linux__) || defined(__gnu_linux__)
+#include <pthread.h>
+#include <sched.h>
+#include <unistd.h>
+#endif
+
+#include "common/os.h"
+#include "context.h"
+#include "frontend.h"
+#include "backend.h"
+#include "rasterizer.h"
+#include "rdtsc_core.h"
+#include "tilemgr.h"
+#include "core/multisample.h"
+
+
+
+
+// One physical core as discovered by CalculateProcessorTopology.
+//  - procGroup: on Windows, the processor group the core belongs to; on
+//    Linux it is set to the "core id" value parsed from /proc/cpuinfo.
+//  - threadIds: ids of the hardware threads on this core (Windows: bit
+//    index within the group affinity mask; Linux: the "processor" number).
+struct Core
+{
+ uint32_t procGroup = 0;
+ std::vector<uint32_t> threadIds;
+};
+
+// All cores that CalculateProcessorTopology assigned to one node. On Linux
+// the node index is the "physical id" value from /proc/cpuinfo.
+struct NumaNode
+{
+ std::vector<Core> cores;
+};
+
+// Whole-machine topology: one entry per node, indexed by node id.
+typedef std::vector<NumaNode> CPUNumaNodes;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Discover the machine's node / core / hardware-thread layout.
+/// @param out_nodes - [OUT] cleared, then filled with one NumaNode per
+///        node id; each node lists its cores and each core its thread ids.
+/// @param out_numThreadsPerProcGroup - [OUT] reset to 0, then counted.
+///        On Windows only threads in processor group 0 are counted; on
+///        Linux every thread listed in /proc/cpuinfo is counted.
+/// @note Windows uses GetLogicalProcessorInformationEx; Linux parses
+///       /proc/cpuinfo. Any other platform is a compile error.
+void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
+{
+    out_nodes.clear();
+    out_numThreadsPerProcGroup = 0;
+
+#if defined(_WIN32)
+
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
+    DWORD bufSize = sizeof(buffer);
+
+    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
+    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
+
+    uint32_t count = bufSize / buffer->Size;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
+
+    for (uint32_t i = 0; i < count; ++i)
+    {
+        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
+        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
+        {
+            auto& gmask = pBuffer->Processor.GroupMask[g];
+            uint32_t threadId = 0;
+            uint32_t procGroup = gmask.Group;
+
+            // Created lazily for the first thread of this core, then reused
+            // for the remaining threads in the same group mask.
+            Core* pCore = nullptr;
+
+            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
+
+            // Visit every set bit of the affinity mask, lowest first.
+            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
+            {
+                // clear mask
+                gmask.Mask &= ~(KAFFINITY(1) << threadId);
+
+                // Find Numa Node
+                PROCESSOR_NUMBER procNum = {};
+                procNum.Group = WORD(procGroup);
+                procNum.Number = UCHAR(threadId);
+
+                uint32_t numaId = 0;
+                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
+                SWR_ASSERT(ret);
+
+                // Store data
+                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+                auto& numaNode = out_nodes[numaId];
+
+                uint32_t coreId = 0;
+
+                if (nullptr == pCore)
+                {
+                    numaNode.cores.push_back(Core());
+                    pCore = &numaNode.cores.back();
+                    pCore->procGroup = procGroup;
+#if !defined(_WIN64)
+                    coreId = (uint32_t)numaNode.cores.size();
+                    if ((coreId * numThreads) >= 32)
+                    {
+                        // Windows doesn't return threadIds >= 32 for a processor group correctly
+                        // when running a 32-bit application.
+                        // Just save -1 as the threadId
+                        threadId = uint32_t(-1);
+                    }
+#endif
+                }
+                pCore->threadIds.push_back(threadId);
+                if (procGroup == 0)
+                {
+                    out_numThreadsPerProcGroup++;
+                }
+            }
+        }
+        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
+    }
+
+
+#elif defined(__linux__) || defined (__gnu_linux__)
+
+    // Parse /proc/cpuinfo to get full topology
+    std::ifstream input("/proc/cpuinfo");
+    std::string line;
+    const uint32_t UNSET = uint32_t(-1);
+    uint32_t threadId = UNSET;
+    uint32_t coreId = UNSET;
+    uint32_t numaId = UNSET;
+
+    // Extracts the number that follows the ": " separator of a cpuinfo line.
+    auto parseValue = [](const std::string& text) -> uint32_t
+    {
+        auto data_start = text.find(": ") + 2;
+        return std::strtoul(&text.c_str()[data_start], nullptr, 10);
+    };
+
+    // Commits the (numaId, coreId, threadId) triple gathered so far.
+    // Does nothing until the first "processor" line has been seen.
+    auto saveThread = [&]()
+    {
+        if (threadId == UNSET)
+        {
+            return;
+        }
+
+        // Some /proc/cpuinfo variants omit "physical id" / "core id".
+        // Indexing with the -1 sentinel would wrap resize(id + 1) to
+        // resize(0) and then access out of bounds, so fall back to 0.
+        const uint32_t node = (numaId == UNSET) ? 0 : numaId;
+        const uint32_t coreIdx = (coreId == UNSET) ? 0 : coreId;
+
+        if (out_nodes.size() <= node) out_nodes.resize(node + 1);
+        auto& numaNode = out_nodes[node];
+        if (numaNode.cores.size() <= coreIdx) numaNode.cores.resize(coreIdx + 1);
+        auto& core = numaNode.cores[coreIdx];
+
+        core.procGroup = coreIdx;
+        core.threadIds.push_back(threadId);
+
+        out_numThreadsPerProcGroup++;
+    };
+
+    while (std::getline(input, line))
+    {
+        if (line.find("processor") != std::string::npos)
+        {
+            // A new "processor" entry starts: flush the previous one first.
+            saveThread();
+            threadId = parseValue(line);
+            continue;
+        }
+        if (line.find("core id") != std::string::npos)
+        {
+            coreId = parseValue(line);
+            continue;
+        }
+        if (line.find("physical id") != std::string::npos)
+        {
+            numaId = parseValue(line);
+            continue;
+        }
+    }
+
+    // Flush the last entry, which has no following "processor" line.
+    saveThread();
+
+    // Core ids may be sparse, which leaves empty placeholder cores behind
+    // after resize(); drop them. erase/remove_if avoids using an iterator
+    // after vector::erase has invalidated it.
+    for (auto& numaNode : out_nodes)
+    {
+        numaNode.cores.erase(
+            std::remove_if(numaNode.cores.begin(), numaNode.cores.end(),
+                           [](const Core& core) { return core.threadIds.empty(); }),
+            numaNode.cores.end());
+    }
+
+#else
+
+#error Unsupported platform
+
+#endif
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Pin the calling thread to a hardware thread (or, on Windows,
+///        optionally just to a processor group).
+/// @param threadId - hardware thread id to bind to.
+/// @param procGroupId - Windows processor group; not used on other platforms.
+/// @param bindProcGroup - when KNOB_MAX_WORKER_THREADS is set, binding is
+///        skipped entirely unless this flag is true.
+void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
+{
+    // With an explicit worker-thread cap, threads are left unbound unless
+    // the caller asked for processor-group binding.
+    const bool shouldBind = !KNOB_MAX_WORKER_THREADS || bindProcGroup;
+    if (!shouldBind)
+    {
+        return;
+    }
+
+#if defined(_WIN32)
+    {
+        GROUP_AFFINITY affinity = {};
+        affinity.Group = procGroupId;
+
+        // A specific HW thread is targeted only when no worker-thread cap is
+        // set; otherwise the mask stays 0 and only the group is specified.
+        bool pinToHwThread = !KNOB_MAX_WORKER_THREADS;
+
+#if !defined(_WIN64)
+        // In a 32-bit process on Windows it is impossible to bind
+        // to logical processors 32-63 within a processor group.
+        // Leave the mask at 0 and let the system assign the processor.
+        if (threadId >= 32)
+        {
+            pinToHwThread = false;
+        }
+#endif
+
+        if (pinToHwThread)
+        {
+            affinity.Mask = KAFFINITY(1) << threadId;
+        }
+
+        SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+    }
+#else
+    // Build a CPU set containing only threadId and apply it to this thread.
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(threadId, &cpuset);
+
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+#endif
+}
+
+/// Returns the context's DrawEnqueued counter.
+/// This is a plain read; an interlocked variant
+/// (_InterlockedCompareExchange64 with 0, 0) was tried and left disabled.
+INLINE
+uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
+{
+    const uint64_t enqueued = pContext->DrawEnqueued;
+    return enqueued;
+}
+
+/// Maps a draw id to its slot in the context's draw-context ring.
+/// Draw ids are treated as 1-based: id 1 maps to slot 0, and ids wrap
+/// every KNOB_MAX_DRAWS_IN_FLIGHT draws.
+INLINE
+DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId)
+{
+    const uint64_t ringSlot = (drawId - 1) % KNOB_MAX_DRAWS_IN_FLIGHT;
+    return &pContext->dcRing[ringSlot];
+}
+
+// Returns true while the draw is still blocked, i.e. the draw it depends on
+// is newer than the last retired draw. pContext is not used.
+INLINE
+bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw)
+{
+    const bool dependencyPending = lastRetiredDraw < pDC->dependency;
+    return dependencyPending;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fill a color hot tile (one macro tile) with its clear color.
+/// @param pHotTile - tile whose clearData holds four floats (R, G, B, A)
+///        and whose pBuffer receives the result.
+/// @note Every SIMD-tile worth of samples is written as four consecutive
+///       SIMD vectors: R, G, B, then A.
+void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
+{
+    // Splat each clear channel across a SIMD register.
+    float *pClear = (float*)(pHotTile->clearData);
+    simdscalar red   = _simd_broadcast_ss(&pClear[0]);
+    simdscalar green = _simd_broadcast_ss(&pClear[1]);
+    simdscalar blue  = _simd_broadcast_ss(&pClear[2]);
+    simdscalar alpha = _simd_broadcast_ss(&pClear[3]);
+
+    float *pDst = (float*)pHotTile->pBuffer;
+
+    // Samples held by one raster tile, and samples covered by one group of stores.
+    const uint32_t samplesPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+    const uint32_t samplesPerStep = SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM;
+
+    // Walk every raster tile of the macro tile; the destination pointer simply
+    // advances linearly, so the tile coordinates only drive the trip count.
+    for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t sample = 0; sample < samplesPerTile; sample += samplesPerStep)
+            {
+                _simd_store_ps(pDst + 0 * KNOB_SIMD_WIDTH, red);
+                _simd_store_ps(pDst + 1 * KNOB_SIMD_WIDTH, green);
+                _simd_store_ps(pDst + 2 * KNOB_SIMD_WIDTH, blue);
+                _simd_store_ps(pDst + 3 * KNOB_SIMD_WIDTH, alpha);
+                pDst += 4 * KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+ /// @brief Fills a depth hot tile (one macrotile) with its clear value.
+ /// Only the first clear-data element is used; it is broadcast across a
+ /// SIMD register and stored once per SIMD tile.
+ void ClearDepthHotTile(const HOTTILE* pHotTile)
+ {
+     float *pClearDepth = (float*)(pHotTile->clearData);
+     const simdscalar depthValue = _simd_broadcast_ss(&pClearDepth[0]);
+
+     float *pDst = (float*)pHotTile->pBuffer;
+     const uint32_t samplesPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+
+     for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+     {
+         for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+         {
+             for (uint32_t sample = 0; sample < samplesPerTile; sample += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
+             {
+                 _simd_store_ps(pDst, depthValue);
+                 pDst += KNOB_SIMD_WIDTH;
+             }
+         }
+     }
+ }
+
+ /// @brief Fills a stencil hot tile (one macrotile) with its clear value.
+ /// The first clear-data element is narrowed to 8 bits and replicated into
+ /// every byte of a SIMD integer register.
+ void ClearStencilHotTile(const HOTTILE* pHotTile)
+ {
+     const uint8_t stencilValue = (uint8_t)(pHotTile->clearData[0]);
+     const simdscalari packedStencil = _simd_set1_epi8(stencilValue);
+
+     simdscalari* pDst = (simdscalari*)pHotTile->pBuffer;
+     const uint32_t samplesPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+
+     for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+     {
+         for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+         {
+             // Four 8-bit values share each 32-bit lane, so one store covers
+             // four times as many samples as in the 32-bit clears.
+             for (uint32_t sample = 0; sample < samplesPerTile; sample += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
+             {
+                 _simd_store_si(pDst, packedStencil);
+                 ++pDst;
+             }
+         }
+     }
+ }
+
+ // Brings every hot tile enabled by the draw's state (colour targets, depth,
+ // stencil) into a drawable state for one macrotile: a tile in the INVALID
+ // state is loaded from its surface, a tile in the CLEAR state is filled with
+ // its clear value, and either way it ends up DIRTY. This runs in the outer
+ // thread loop rather than inside the draw routine itself, mainly for
+ // performance, to avoid repeating the setup for every triangle.
+ // pWork is accepted but not used by this routine.
+ // @todo support deferred clear
+ INLINE
+ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
+ {
+     const API_STATE& apiState = GetApiState(pDC);
+     HotTileMgr *pTileMgr = pContext->pHotTileMgr;
+
+     // Convert the macrotile id into the pixel origin of the macrotile.
+     uint32_t originX, originY;
+     MacroTileMgr::getTileIndices(macroID, originX, originY);
+     originX *= KNOB_MACROTILE_X_DIM;
+     originY *= KNOB_MACROTILE_Y_DIM;
+
+     const uint32_t sampleCount = GetNumSamples(apiState.rastState.sampleCount);
+
+     // Colour: visit each enabled render target, lowest slot first, clearing
+     // its bit from the pending mask once handled.
+     unsigned long slot = 0;
+     for (uint32_t pendingMask = apiState.colorHottileEnable;
+          _BitScanForward(&slot, pendingMask);
+          pendingMask &= ~(1 << slot))
+     {
+         const SWR_RENDERTARGET_ATTACHMENT colorAttachment = (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + slot);
+         HOTTILE* pColorTile = pTileMgr->GetHotTile(pContext, pDC, macroID, colorAttachment, true, sampleCount);
+
+         switch (pColorTile->state)
+         {
+         case HOTTILE_INVALID:
+         {
+             RDTSC_START(BELoadTiles);
+             // An invalid tile must be loaded from the surface before drawing.
+             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, colorAttachment, originX, originY, pColorTile->renderTargetArrayIndex, pColorTile->pBuffer);
+             pColorTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         case HOTTILE_CLEAR:
+         {
+             RDTSC_START(BELoadTiles);
+             ClearColorHotTile(pColorTile);
+             pColorTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         default:
+             break;
+         }
+     }
+
+     // Depth
+     if (apiState.depthHottileEnable)
+     {
+         HOTTILE* pDepthTile = pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, sampleCount);
+
+         switch (pDepthTile->state)
+         {
+         case HOTTILE_INVALID:
+         {
+             RDTSC_START(BELoadTiles);
+             // An invalid tile must be loaded from the surface before drawing.
+             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, originX, originY, pDepthTile->renderTargetArrayIndex, pDepthTile->pBuffer);
+             pDepthTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         case HOTTILE_CLEAR:
+         {
+             RDTSC_START(BELoadTiles);
+             ClearDepthHotTile(pDepthTile);
+             pDepthTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         default:
+             break;
+         }
+     }
+
+     // Stencil
+     if (apiState.stencilHottileEnable)
+     {
+         HOTTILE* pStencilTile = pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, sampleCount);
+
+         switch (pStencilTile->state)
+         {
+         case HOTTILE_INVALID:
+         {
+             RDTSC_START(BELoadTiles);
+             // An invalid tile must be loaded from the surface before drawing.
+             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, originX, originY, pStencilTile->renderTargetArrayIndex, pStencilTile->pBuffer);
+             pStencilTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         case HOTTILE_CLEAR:
+         {
+             RDTSC_START(BELoadTiles);
+             ClearStencilHotTile(pStencilTile);
+             pStencilTile->state = HOTTILE_DIRTY;
+             RDTSC_STOP(BELoadTiles, 0, 0);
+             break;
+         }
+         default:
+             break;
+         }
+     }
+ }
+
+ /// @brief Advances curDrawBE past every draw whose work is already complete.
+ /// @param pContext - pointer to SWR context.
+ /// @param curDrawBE - this thread's draw cursor; advanced in place, and
+ ///                    threadsDoneBE is bumped for each draw it moves past.
+ /// @return true if an incomplete draw remains at curDrawBE, false if the
+ ///         cursor caught up with the enqueued draw count.
+ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+ {
+     const uint64_t enqueued = GetEnqueuedDraw(pContext);
+
+     while (curDrawBE < enqueued)
+     {
+         DRAW_CONTEXT *pDraw = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+         // A non-compute draw whose FE has not finished cannot be complete.
+         if (!(pDraw->doneFE || pDraw->isCompute))
+         {
+             break;
+         }
+
+         // Compute draws report completion through their dispatch queue,
+         // everything else through the macrotile manager.
+         bool finished;
+         if (pDraw->isCompute)
+         {
+             finished = pDraw->pDispatch->isWorkComplete();
+         }
+         else
+         {
+             finished = pDraw->pTileMgr->isWorkComplete();
+         }
+
+         if (!finished)
+         {
+             break;
+         }
+
+         curDrawBE++;
+         InterlockedIncrement(&pDraw->threadsDoneBE);
+     }
+
+     return curDrawBE < enqueued;
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any BE work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+/// has its own curDrawBE counter and this ensures that each worker processes all the
+/// draws in order.
+/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
+/// own set and each time it fails to lock a macrotile, because its already locked,
+/// then it will add that tile to the lockedTiles set. As a worker begins to work
+/// on future draws the lockedTiles ensure that it doesn't work on tiles that may
+/// still have work pending in a previous draw. Additionally, the lockedTiles is
+/// hueristic that can steer a worker back to the same macrotile that it had been
+/// working on in a previous draw.
+void WorkOnFifoBE(
+ SWR_CONTEXT *pContext,
+ uint32_t workerId,
+ uint64_t &curDrawBE,
+ std::unordered_set<uint32_t>& lockedTiles)
+{
+ // Find the first incomplete draw that has pending work. If no such draw is found then
+ // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ {
+ return;
+ }
+
+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+ // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
+ lockedTiles.clear();
+
+ // Try to work on each draw in order of the available draws in flight.
+ // 1. If we're on curDrawBE, we can work on any macrotile that is available.
+ // 2. If we're trying to work on draws after curDrawBE, we are restricted to
+ // working on those macrotiles that are known to be complete in the prior draw to
+ // maintain order. The locked tiles provides the history to ensures this.
+ for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+ {
+ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+ if (pDC->isCompute) return; // We don't look at compute work.
+
+ // First wait for FE to be finished with this draw. This keeps threading model simple
+ // but if there are lots of bubbles between draws then serializing FE and BE may
+ // need to be revisited.
+ if (!pDC->doneFE) return;
+
+ // If this draw is dependent on a previous draw then we need to bail.
+ if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ {
+ return;
+ }
+
+ // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
+ std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();
+
+ for (uint32_t tileID : macroTiles)
+ {
+ MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+
+ // can only work on this draw if it's not in use by other threads
+ if (lockedTiles.find(tileID) == lockedTiles.end())
+ {
+ if (tile.getNumQueued())
+ {
+ if (tile.tryLock())
+ {
+ BE_WORK *pWork;
+
+ RDTSC_START(WorkerFoundWork);
+
+ uint32_t numWorkItems = tile.getNumQueued();
+
+ if (numWorkItems != 0)
+ {
+ pWork = tile.peek();
+ SWR_ASSERT(pWork);
+ if (pWork->type == DRAW)
+ {
+ InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
+ }
+ }
+
+ while ((pWork = tile.peek()) != nullptr)
+ {
+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+ tile.dequeue();
+ }
+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+ _ReadWriteBarrier();
+
+ pDC->pTileMgr->markTileComplete(tileID);
+
+ // Optimization: If the draw is complete and we're the last one to have worked on it then
+ // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+ {
+ // We can increment the current BE and safely move to next draw since we know this draw is complete.
+ curDrawBE++;
+ InterlockedIncrement(&pDC->threadsDoneBE);
+
+ lastRetiredDraw++;
+
+ lockedTiles.clear();
+ break;
+ }
+ }
+ else
+ {
+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+ lockedTiles.insert(tileID);
+ }
+ }
+ }
+ }
+ }
+}
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief If there is any FE work then go work on it.
+ /// @param pContext - pointer to SWR context.
+ /// @param workerId - The unique worker ID that is assigned to this thread.
+ /// @param curDrawFE - This thread's FE draw cursor; advanced past draws that
+ ///                    are compute, already done, or already claimed.
+ /// @param numaNode - accepted but not used by this routine.
+ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+ {
+     const uint64_t enqueued = GetEnqueuedDraw(pContext);
+
+     // Phase 1: move our cursor past every leading draw that needs no FE
+     // attention from us, recording that this thread is done with it.
+     while (curDrawFE < enqueued)
+     {
+         DRAW_CONTEXT *pHead = &pContext->dcRing[curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+         const bool nothingToDo = pHead->isCompute || pHead->doneFE || pHead->FeLock;
+         if (!nothingToDo)
+         {
+             break;
+         }
+
+         curDrawFE++;
+         InterlockedIncrement(&pHead->threadsDoneFE);
+     }
+
+     // Phase 2: scan the remaining draws and run the FE of each one we
+     // manage to claim.
+     for (uint64_t draw = curDrawFE; draw < enqueued; ++draw)
+     {
+         DRAW_CONTEXT *pDC = &pContext->dcRing[draw % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+         if (pDC->isCompute || pDC->FeLock)
+         {
+             continue;
+         }
+
+         // Claim the draw by flipping FeLock from 0 to 1; only the thread
+         // that observes the old value 0 owns the FE work.
+         const uint32_t previous = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
+         if (previous != 0)
+         {
+             continue;
+         }
+
+         pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
+
+         _ReadWriteBarrier();
+         pDC->doneFE = true;
+     }
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any compute work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+/// has its own curDrawBE counter and this ensures that each worker processes all the
+/// draws in order.
+void WorkOnCompute(
+ SWR_CONTEXT *pContext,
+ uint32_t workerId,
+ uint64_t& curDrawBE)
+{
+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ {
+ return;
+ }
+
+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+ if (pDC->isCompute == false) return;
+
+ // check dependencies
+ if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ {
+ return;
+ }
+
+ SWR_ASSERT(pDC->pDispatch != nullptr);
+ DispatchQueue& queue = *pDC->pDispatch;
+
+ // Is there any work remaining?
+ if (queue.getNumQueued() > 0)
+ {
+ bool lastToComplete = false;
+
+ uint32_t threadGroupId = 0;
+ while (queue.getWork(threadGroupId))
+ {
+ ProcessComputeBE(pDC, workerId, threadGroupId);
+
+ lastToComplete = queue.finishedWork();
+ }
+
+ _ReadWriteBarrier();
+
+ if (lastToComplete)
+ {
+ SWR_ASSERT(queue.isWorkComplete() == true);
+ pDC->doneCompute = true;
+ }
+ }
+}
+
+ DWORD workerThreadMain(LPVOID pData) // worker thread body: pData is this worker's THREAD_DATA; returns 0 once shutdown is observed
+ {
+ THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
+ SWR_CONTEXT *pContext = pThreadData->pContext;
+ uint32_t threadId = pThreadData->threadId;
+ uint32_t workerId = pThreadData->workerId;
+
+ bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); // apply the placement recorded in THREAD_DATA
+
+ RDTSC_INIT(threadId);
+
+ int numaNode = (int)pThreadData->numaId; // only forwarded to WorkOnFifoFE below
+
+ // flush denormals to 0
+ _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+
+ // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
+ // locked then we'll add it to this list so that we don't try and lock it again.
+ std::unordered_set<uint32_t> lockedTiles;
+
+ // each worker has the ability to work on any of the queued draws as long as certain
+ // conditions are met. the data associated
+ // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
+ // has moved on to the next draw when he determines there is no more work to do. The api
+ // thread will not increment the head of the dc ring until all workers have moved past the
+ // current head.
+ // the logic to determine what to work on is:
+ // 1- try to work on the FE of any draw that is queued. For now there are no dependencies
+ // on the FE work, so any worker can grab any FE and process in parallel. Eventually
+ // we'll need dependency tracking to force serialization on FEs. The worker will try
+ // to pick an FE by atomically incrementing a counter in the swr context. he'll keep
+ // trying until he reaches the tail.
+ // 2- BE work must be done in strict order. we accomplish this today by pulling work off
+ // the oldest draw (ie the head) of the dcRing. the worker can determine if there is
+ // any work left by comparing the total # of binned work items and the total # of completed
+ // work items. If they are equal, then there is no more work to do for this draw, and
+ // the worker can safely increment its oldestDraw counter and move on to the next draw.
+ std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); // constructed unlocked; only taken around the idle wait below
+
+ auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; // true while the given cursor has not caught up with DrawEnqueued
+
+ uint64_t curDrawBE = 1; // per-thread BE/compute draw cursor
+ uint64_t curDrawFE = 1; // per-thread FE draw cursor
+
+ while (pContext->threadPool.inThreadShutdown == false)
+ {
+ uint32_t loop = 0;
+ while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE)) // spin for a bounded number of iterations before sleeping
+ {
+ _mm_pause();
+ }
+
+ if (!threadHasWork(curDrawBE))
+ {
+ lock.lock();
+
+ // check for thread idle condition again under lock
+ if (threadHasWork(curDrawBE))
+ {
+ lock.unlock();
+ continue;
+ }
+
+ if (pContext->threadPool.inThreadShutdown)
+ {
+ lock.unlock();
+ break;
+ }
+
+ RDTSC_START(WorkerWaitForThreadEvent);
+
+ pContext->FifosNotEmpty.wait(lock); // no predicate: any wakeup, including a spurious one, falls through to the work calls below
+ lock.unlock();
+
+ RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);
+
+ if (pContext->threadPool.inThreadShutdown)
+ {
+ break;
+ }
+ }
+
+ RDTSC_START(WorkerWorkOnFifoBE);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+
+ WorkOnCompute(pContext, workerId, curDrawBE); // shares the curDrawBE cursor with the BE pass above
+
+ WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+ }
+
+ return 0;
+ }
+
+ /// @brief Thread entry point handed to std::thread; forwards to
+ ///        workerThreadMain. On Windows the call is wrapped in a structured
+ ///        exception block whose filter is EXCEPTION_CONTINUE_SEARCH.
+ /// @param pData - pointer to this worker's THREAD_DATA.
+ /// @return workerThreadMain's result; 1 only if the Windows handler path
+ ///         falls through.
+ DWORD workerThreadInit(LPVOID pData)
+ {
+ #if defined(_WIN32)
+     __try
+     {
+         return workerThreadMain(pData);
+     }
+     __except(EXCEPTION_CONTINUE_SEARCH)
+     {
+     }
+
+     return 1;
+ #else
+     return workerThreadMain(pData);
+ #endif // _WIN32
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Sizes the worker pool from the processor topology and the
+ ///        MAX_* knobs, then launches one std::thread per worker.
+ /// @param pContext - pointer to SWR context; receives NumWorkerThreads and
+ ///                   is handed to every worker.
+ /// @param pPool - thread pool to populate. If no HW thread can be spared
+ ///                for a worker, numThreads is set to 0, the SINGLE_THREADED
+ ///                knob is switched on and no thread is created.
+ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
+ {
+     // Pin the calling (API) thread to HW thread 0 first.
+     bindThread(0);
+
+     CPUNumaNodes nodes;
+     uint32_t numThreadsPerProcGroup = 0;
+     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
+
+     // The first node / first core are taken as representative of the machine.
+     uint32_t numHWNodes = (uint32_t)nodes.size();
+     uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
+     uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
+
+     uint32_t numNodes = numHWNodes;
+     uint32_t numCoresPerNode = numHWCoresPerNode;
+     uint32_t numHyperThreads = numHWHyperThreads;
+
+     // Each knob, when non-zero, caps the corresponding topology dimension.
+     if (KNOB_MAX_NUMA_NODES)
+     {
+         numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
+     }
+
+     if (KNOB_MAX_CORES_PER_NUMA_NODE)
+     {
+         numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
+     }
+
+     if (KNOB_MAX_THREADS_PER_CORE)
+     {
+         numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
+     }
+
+     // Calculate numThreads
+     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+
+     if (KNOB_MAX_WORKER_THREADS)
+     {
+         uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
+         numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
+     }
+
+     if (numThreads > KNOB_MAX_NUM_THREADS)
+     {
+         printf("WARNING: system thread count %u exceeds max %u, "
+                "performance will be degraded\n",
+                numThreads, KNOB_MAX_NUM_THREADS);
+
+         // pPool->threads[] only has KNOB_MAX_NUM_THREADS slots, so the pool
+         // must never be sized beyond that.
+         numThreads = KNOB_MAX_NUM_THREADS;
+     }
+
+     if (numThreads == 1)
+     {
+         // If only 1 worker thread, try to move it to an available
+         // HW thread. If that fails, use the API thread.
+         if (numCoresPerNode < numHWCoresPerNode)
+         {
+             numCoresPerNode++;
+         }
+         else if (numHyperThreads < numHWHyperThreads)
+         {
+             numHyperThreads++;
+         }
+         else if (numNodes < numHWNodes)
+         {
+             numNodes++;
+         }
+         else
+         {
+             pPool->numThreads = 0;
+             SET_KNOB(SINGLE_THREADED, true);
+             return;
+         }
+     }
+     else
+     {
+         // Save a HW thread for the API thread.
+         numThreads--;
+     }
+
+     pPool->numThreads = numThreads;
+     pContext->NumWorkerThreads = pPool->numThreads;
+
+     pPool->inThreadShutdown = false;
+     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+
+     if (KNOB_MAX_WORKER_THREADS)
+     {
+         bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
+         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
+         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
+         // But Windows will still require binding to specific process groups
+         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
+         {
+             pPool->pThreadData[workerId].workerId = workerId;
+             pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
+             pPool->pThreadData[workerId].threadId = 0;
+             pPool->pThreadData[workerId].numaId = 0;
+             pPool->pThreadData[workerId].pContext = pContext;
+             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
+             pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+         }
+     }
+     else
+     {
+         // Walk the topology node by node, core by core, handing each HW
+         // thread to one worker. Every loop also stops once numThreads
+         // workers exist, so neither pThreadData nor threads[] is overrun
+         // when the count was clamped above.
+         uint32_t workerId = 0;
+         for (uint32_t n = 0; n < numNodes && workerId < numThreads; ++n)
+         {
+             auto& node = nodes[n];
+
+             uint32_t numCores = numCoresPerNode;
+             for (uint32_t c = 0; c < numCores && workerId < numThreads; ++c)
+             {
+                 auto& core = node.cores[c];
+                 for (uint32_t t = 0; t < numHyperThreads && workerId < numThreads; ++t)
+                 {
+                     if (c == 0 && n == 0 && t == 0)
+                     {
+                         // Skip core 0, thread0 on node 0 to reserve for API thread
+                         continue;
+                     }
+
+                     pPool->pThreadData[workerId].workerId = workerId;
+                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
+                     pPool->pThreadData[workerId].threadId = core.threadIds[t];
+                     pPool->pThreadData[workerId].numaId = n;
+                     pPool->pThreadData[workerId].pContext = pContext;
+                     // The worker reads this field unconditionally and the
+                     // array comes from malloc, so it must be written here too.
+                     pPool->pThreadData[workerId].forceBindProcGroup = false;
+                     pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+                     ++workerId;
+                 }
+             }
+         }
+     }
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Signals all workers to exit, joins them and releases the pool's
+ ///        per-thread data. Does nothing when running single threaded.
+ /// @param pContext - pointer to SWR context owning the wait lock/condition.
+ /// @param pPool - thread pool to tear down.
+ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
+ {
+     if (KNOB_SINGLE_THREADED)
+     {
+         return;
+     }
+
+     // Raise the shutdown flag and wake every sleeping worker while holding
+     // the wait lock; the lock is dropped at the end of this scope.
+     {
+         std::unique_lock<std::mutex> guard(pContext->WaitLock);
+         pPool->inThreadShutdown = true;
+         _mm_mfence();
+         pContext->FifosNotEmpty.notify_all();
+     }
+
+     // Wait for each worker to finish, then destroy its thread object.
+     for (uint32_t i = 0; i < pPool->numThreads; ++i)
+     {
+         auto *pThread = pPool->threads[i];
+         pThread->join();
+         delete pThread;
+     }
+
+     // Release the per-thread launch data.
+     free(pPool->pThreadData);
+ }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
new file mode 100644
index 00000000000..0fa7196f5ac
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -0,0 +1,63 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file threads.h
+*
+* @brief Definitions for SWR threading model.
+*
+******************************************************************************/
+#pragma once
+
+#include "knobs.h"
+
+#include <unordered_set>
+#include <thread>
+ using THREAD_PTR = std::thread*; // handle type stored in THREAD_POOL::threads
+
+struct SWR_CONTEXT;
+
+ struct THREAD_DATA // per-worker launch parameters; one instance is handed to each worker thread
+ {
+ uint32_t procGroupId; // Will always be 0 for non-Windows OS
+ uint32_t threadId; // HW thread to bind to; within the procGroup for Windows
+ uint32_t numaId; // NUMA node id
+ uint32_t workerId; // index of this worker within the pool
+ SWR_CONTEXT *pContext; // context the worker pulls its work from
+ bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
+ };
+
+
+ struct THREAD_POOL // bookkeeping for the worker threads created by CreateThreadPool
+ {
+ THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; // one thread object per worker; capacity is fixed at KNOB_MAX_NUM_THREADS
+ uint32_t numThreads; // number of worker threads in the pool
+ volatile bool inThreadShutdown; // shutdown request flag for the workers
+ THREAD_DATA *pThreadData; // array of per-worker launch data
+ };
+
+ // Thread pool lifetime: create the workers for a context / shut them down.
+ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+
+ // Expose FE and BE worker functions to the API thread if single threaded.
+ // Each takes the caller's own draw cursor by reference and advances it;
+ // lockedTiles is the caller's record of macrotiles it found locked.
+ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
+ void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &lockedTiles);
+ void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
new file mode 100644
index 00000000000..860393661e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -0,0 +1,105 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.cpp
+*
+* @brief Implementation for Macro Tile Manager which provides the facilities
+* for threads to work on a macro tile.
+*
+******************************************************************************/
+#include <unordered_map>
+
+#include "fifo.hpp"
+#include "tilemgr.h"
+
+ // Packs a macrotile's (x, y) indices into one id: x in the upper 16 bits,
+ // y in the lower bits. Arguments are parenthesized so that expressions such
+ // as `a | b` or `c ? d : e` can be passed safely.
+ #define TILE_ID(x,y) ((((x)) << 16) | (y))
+
+ // MacroTileMgr overrides new/delete so its instances are 64-byte aligned.
+ void *MacroTileMgr::operator new(size_t size)
+ {
+     const size_t alignmentBytes = 64;
+     void *pStorage = _aligned_malloc(size, alignmentBytes);
+     return pStorage;
+ }
+
+ void MacroTileMgr::operator delete(void *p) // counterpart of the aligned operator new
+ {
+ _aligned_free(p); // storage came from _aligned_malloc
+ }
+
+ // DispatchQueue instances are allocated on a 64-byte boundary as well.
+ void* DispatchQueue::operator new(size_t size)
+ {
+     const size_t alignmentBytes = 64;
+     void *pStorage = _aligned_malloc(size, alignmentBytes);
+     return pStorage;
+ }
+
+ void DispatchQueue::operator delete(void *p) // counterpart of the aligned operator new
+ {
+ _aligned_free(p); // storage came from _aligned_malloc
+ }
+
+ MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena) // keeps a reference to the arena later passed to the tile queues
+ {
+ }
+
+ /// @brief Resets the manager: forgets all dirty tiles and zeroes both the
+ ///        produced and the consumed work-item counters.
+ void MacroTileMgr::initialize()
+ {
+     mDirtyTiles.clear();
+
+     mWorkItemsConsumed = 0;
+     mWorkItemsProduced = 0;
+ }
+
+ /// @brief Queues one BE work item to the macrotile at index (x, y).
+ /// @param x - macrotile x index; asserted to be below KNOB_NUM_HOT_TILES_X.
+ /// @param y - macrotile y index; asserted to be below KNOB_NUM_HOT_TILES_Y.
+ /// @param pWork - work item to append to that tile's queue.
+ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
+ {
+     // Should not enqueue more than what we have backing for in the hot tile manager.
+     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+     const uint32_t tileId = TILE_ID(x, y);
+     MacroTileQueue &queue = mTiles[tileId];
+
+     queue.mWorkItemsFE++;
+     const bool firstItemForTile = (queue.mWorkItemsFE == 1);
+
+     // The first item queued to a tile resets the tile's queue and records
+     // the tile as dirty.
+     if (firstItemForTile)
+     {
+         queue.clear(mArena);
+         mDirtyTiles.push_back(tileId);
+     }
+
+     mWorkItemsProduced++;
+     queue.enqueue_try_nosync(mArena, pWork);
+ }
+
+ /// @brief Records that all work queued to a macrotile has been processed.
+ /// @param id - tile id (as produced by TILE_ID); must already exist in the
+ ///             tile map.
+ void MacroTileMgr::markTileComplete(uint32_t id)
+ {
+     SWR_ASSERT(mTiles.find(id) != mTiles.end());
+     MacroTileQueue &queue = mTiles[id];
+
+     // Credit every item the FE queued to this tile to the consumed total.
+     const uint32_t completedItems = queue.mWorkItemsFE;
+     InterlockedExchangeAdd(&mWorkItemsConsumed, completedItems);
+
+     _ReadWriteBarrier();
+     queue.mWorkItemsBE += completedItems;
+     SWR_ASSERT(queue.mWorkItemsFE == queue.mWorkItemsBE);
+
+     // Zero the per-tile counters now, but leave the fifo itself alone until
+     // the next DC first queues to this tile. This prevents worker threads
+     // from constantly locking a completed macro tile.
+     queue.mWorkItemsFE = 0;
+     queue.mWorkItemsBE = 0;
+ }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
new file mode 100644
index 00000000000..9137941bad4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -0,0 +1,390 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.h
+*
+* @brief Definitions for Macro Tile Manager which provides the facilities
+* for threads to work on a macro tile.
+*
+******************************************************************************/
+#pragma once
+
+#include <set>
+#include <unordered_map>
+#include "common/formats.h"
+#include "fifo.hpp"
+#include "context.h"
+#include "format_traits.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// MacroTile - work queue for a tile.
+//////////////////////////////////////////////////////////////////////////
+// Thin wrapper around a QUEUE<BE_WORK> fifo plus two public work counters.
+// Every member function simply forwards to the underlying fifo.
+struct MacroTileQueue
+{
+ MacroTileQueue() { }
+ ~MacroTileQueue() { }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Returns number of work items queued for this tile.
+ uint32_t getNumQueued()
+ {
+ return mFifo.getNumQueued();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Attempt to lock the work fifo. If already locked then return false.
+ bool tryLock()
+ {
+ return mFifo.tryLock();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Clear fifo and unlock it.
+ /// @param arena - arena forwarded to the fifo's clear()
+ void clear(Arena& arena)
+ {
+ mFifo.clear(arena);
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Peek at work sitting at the front of the fifo.
+ BE_WORK* peek()
+ {
+ return mFifo.peek();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Forward a work item to the fifo's enqueue_try_nosync.
+ /// @param arena - arena forwarded to the fifo
+ /// @param entry - work item to queue
+ /// @return whatever the fifo reports (presumably true on success - confirm
+ ///         against fifo.hpp)
+ bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+ {
+ return mFifo.enqueue_try_nosync(arena, entry);
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Move to next work item
+ void dequeue()
+ {
+ mFifo.dequeue_noinc();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Destroy fifo
+ void destroy()
+ {
+ mFifo.destroy();
+ }
+
+ ///@todo This will all be private.
+ // Items queued on this tile (incremented by MacroTileMgr::enqueue).
+ uint32_t mWorkItemsFE = 0;
+ // Items accounted as completed (updated by MacroTileMgr::markTileComplete).
+ uint32_t mWorkItemsBE = 0;
+
+private:
+ QUEUE<BE_WORK> mFifo;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// MacroTileMgr - Manages macrotiles for a draw.
+//////////////////////////////////////////////////////////////////////////
+class MacroTileMgr
+{
+public:
+    /// @brief Binds the manager to the arena used for tile queue storage.
+    MacroTileMgr(Arena& arena);
+
+    /// @brief Destroys the fifo of every tile queue that was ever created.
+    ~MacroTileMgr()
+    {
+        for (auto &tile : mTiles)
+        {
+            tile.second.destroy();
+        }
+    }
+
+    /// @brief Resets the work counters and the dirty tile list.
+    void initialize();
+
+    /// @brief Ids of all tiles that have had work queued since initialize().
+    INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
+
+    /// @brief Queue for tile `id`; the entry is created if it does not exist yet.
+    INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
+
+    /// @brief Accounts all work queued on tile `id` as consumed.
+    void markTileComplete(uint32_t id);
+
+    /// @brief True when as many work items were consumed as were produced.
+    INLINE bool isWorkComplete()
+    {
+        return mWorkItemsProduced == mWorkItemsConsumed;
+    }
+
+    /// @brief Queues `pWork` on the macrotile at tile coordinates (x, y).
+    void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork);
+
+    /// @brief Splits a tile id into its coordinates.
+    /// @param tileID - id with x in bits 16..31 and y in bits 0..15
+    /// @param x      - receives the x coordinate
+    /// @param y      - receives the y coordinate
+    static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
+    {
+        y = tileID & 0xffff;
+        x = (tileID >> 16) & 0xffff;
+    }
+
+    // Allocation is overridden in tilemgr.cpp (aligned malloc/free).
+    void *operator new(size_t size);
+    void operator delete (void *p);
+
+private:
+    Arena& mArena;
+    // NOTE(review): nothing visible in this file assigns or reads mFormat.
+    SWR_FORMAT mFormat;
+    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
+
+    // Any tile that has work queued to it is a dirty tile.
+    std::vector<uint32_t> mDirtyTiles;
+
+    // Zero-initialized (as DispatchQueue does for its counters) so that
+    // isWorkComplete() is well defined even before initialize() has run.
+    OSALIGNLINE(LONG) mWorkItemsProduced{ 0 };
+    OSALIGNLINE(volatile LONG) mWorkItemsConsumed{ 0 };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// DispatchQueue - work queue for dispatch
+//////////////////////////////////////////////////////////////////////////
+class DispatchQueue
+{
+public:
+    DispatchQueue() {}
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Setup the producer consumer counts.
+    /// @param totalTasks - number of tasks (thread groups) in this dispatch
+    /// @param pTaskData  - opaque pointer handed back by GetTasksData()
+    void initialize(uint32_t totalTasks, void* pTaskData)
+    {
+        // The available and outstanding counts start with total tasks.
+        // At the start there are N tasks available and outstanding.
+        // When both the available and outstanding counts have reached 0 then all work has completed.
+        // When a worker starts on a threadgroup then it decrements the available count.
+        // When a worker completes a threadgroup then it decrements the outstanding count.
+
+        mTasksAvailable = totalTasks;
+        mTasksOutstanding = totalTasks;
+
+        mpTaskData = pTaskData;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Returns number of tasks available for this dispatch.
+    uint32_t getNumQueued()
+    {
+        // Read the volatile counter exactly once. getWork() decrements it
+        // (possibly below zero), so testing one read and returning a second
+        // one could hand back a negative value wrapped to a huge uint32_t.
+        const LONG available = mTasksAvailable;
+        return (available > 0) ? (uint32_t)available : 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the work available count. If the result
+    ///        is greater than or equal to 0 it is the id of the thread group
+    ///        the caller can work on. Otherwise, there is no more work to do.
+    /// @param groupId - receives the thread group id when true is returned
+    bool getWork(uint32_t& groupId)
+    {
+        LONG result = InterlockedDecrement(&mTasksAvailable);
+
+        if (result >= 0)
+        {
+            groupId = result;
+            return true;
+        }
+
+        return false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the outstanding count. A worker is notifying
+    ///        us that he just finished some work. Also, return true if we're
+    ///        the last worker to complete this dispatch.
+    bool finishedWork()
+    {
+        LONG result = InterlockedDecrement(&mTasksOutstanding);
+        SWR_ASSERT(result >= 0, "Should never oversubscribe work");
+
+        return result == 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Work is complete once both the available/outstanding counts have reached 0.
+    bool isWorkComplete()
+    {
+        return ((mTasksAvailable <= 0) &&
+                (mTasksOutstanding <= 0));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Return pointer to task data.
+    const void* GetTasksData()
+    {
+        return mpTaskData;
+    }
+
+    // Allocation is overridden in tilemgr.cpp (aligned malloc/free).
+    void *operator new(size_t size);
+    void operator delete (void *p);
+
+    // The API thread will set this up and the callback task function will interpret this.
+    // Null until initialize() is called.
+    void* mpTaskData{ nullptr };
+
+    OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
+    OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
+};
+
+
+// State of the contents of a single hot tile buffer.
+enum HOTTILE_STATE
+{
+ HOTTILE_INVALID, // tile is in uninitialized state and should be loaded with surface contents before rendering
+ HOTTILE_CLEAR, // tile should be cleared
+ HOTTILE_DIRTY, // tile has been rendered to
+ HOTTILE_RESOLVED, // tile has been stored to memory
+};
+
+// One hot tile: the backing buffer of a macrotile for a single attachment,
+// together with the bookkeeping HotTileMgr keeps for it.
+struct HOTTILE
+{
+ BYTE *pBuffer; // tile storage; allocated by HotTileMgr::GetHotTile, NULL until then
+ HOTTILE_STATE state; // current state of the buffer contents
+ DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
+ uint32_t numSamples; // sample count the buffer was sized for
+ uint32_t renderTargetArrayIndex; // current render target array index loaded
+};
+
+// All hot tiles of one macrotile, addressable either by name
+// (Color[n] / Depth / Stencil) or by attachment index (Attachment[n]).
+// NOTE(review): the two views only line up if SWR_NUM_ATTACHMENTS equals
+// SWR_NUM_RENDERTARGETS + 2 and the attachment enum is ordered colors,
+// depth, stencil - confirm against the enum definition.
+union HotTileSet
+{
+ struct
+ {
+ HOTTILE Color[SWR_NUM_RENDERTARGETS];
+ HOTTILE Depth;
+ HOTTILE Stencil;
+ };
+ HOTTILE Attachment[SWR_NUM_ATTACHMENTS];
+};
+
+// Owns the hot tile buffers for a fixed KNOB_NUM_HOT_TILES_X x
+// KNOB_NUM_HOT_TILES_Y grid of macrotiles. Buffers are allocated lazily by
+// GetHotTile() and released in the destructor.
+// NOTE(review): the class holds raw owning pointers but does not disable
+// copying; confirm instances are never copied.
+class HotTileMgr
+{
+public:
+ // Zeroes every hot tile descriptor and caches the per-attachment buffer
+ // size (in bytes, for one sample) derived from the hot tile formats.
+ HotTileMgr()
+ {
+ memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
+
+ // cache hottile size
+ for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
+ {
+ mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+ }
+ mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+ mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+ }
+
+ // Frees every buffer that GetHotTile() allocated.
+ ~HotTileMgr()
+ {
+ for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x)
+ {
+ for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y)
+ {
+ for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
+ {
+ if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
+ {
+ _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
+ mHotTiles[x][y].Attachment[a].pBuffer = NULL;
+ }
+ }
+ }
+ }
+ }
+
+ // Returns the hot tile of `attachment` for macrotile `macroID`.
+ // @param pContext - provides the pfnStoreTile / pfnLoadTile callbacks
+ // @param pDC - draw context whose private state is passed to the callbacks
+ // @param macroID - tile id (decoded with MacroTileMgr::getTileIndices)
+ // @param attachment - attachment whose tile is requested
+ // @param create - allocate the buffer if it does not exist yet
+ // @param numSamples - required sample count; a larger count than the
+ // current one reallocates the buffer
+ // @param renderTargetArrayIndex - array slice that must be resident
+ // @return the hot tile, or NULL if it has no buffer and `create` is false
+ HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+ uint32_t renderTargetArrayIndex = 0)
+ {
+ uint32_t x, y;
+ MacroTileMgr::getTileIndices(macroID, x, y);
+
+ assert(x < KNOB_NUM_HOT_TILES_X);
+ assert(y < KNOB_NUM_HOT_TILES_Y);
+
+ HotTileSet &tile = mHotTiles[x][y];
+ HOTTILE& hotTile = tile.Attachment[attachment];
+ if (hotTile.pBuffer == NULL)
+ {
+ if (create)
+ {
+ // first use of this tile: allocate and mark contents as invalid
+ // NOTE(review): the _aligned_malloc result is not checked
+ uint32_t size = numSamples * mHotTileSize[attachment];
+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ hotTile.state = HOTTILE_INVALID;
+ hotTile.numSamples = numSamples;
+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+ }
+ else
+ {
+ return NULL;
+ }
+ }
+ else
+ {
+ // free the old tile and create a new one with enough space to hold all samples
+ if (numSamples > hotTile.numSamples)
+ {
+ // tile should be either uninitialized or resolved if we're deleting and switching to a
+ // new sample count
+ assert((hotTile.state == HOTTILE_INVALID) ||
+ (hotTile.state == HOTTILE_RESOLVED) ||
+ (hotTile.state == HOTTILE_CLEAR));
+ _aligned_free(hotTile.pBuffer);
+
+ // NOTE(review): the _aligned_malloc result is not checked
+ uint32_t size = numSamples * mHotTileSize[attachment];
+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ hotTile.state = HOTTILE_INVALID;
+ hotTile.numSamples = numSamples;
+ }
+
+ // if requested render target array index isn't currently loaded, need to store out the current hottile
+ // and load the requested array slice
+ if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
+ {
+ // pick the hot tile format that matches the attachment kind
+ SWR_FORMAT format;
+ switch (attachment)
+ {
+ case SWR_ATTACHMENT_COLOR0:
+ case SWR_ATTACHMENT_COLOR1:
+ case SWR_ATTACHMENT_COLOR2:
+ case SWR_ATTACHMENT_COLOR3:
+ case SWR_ATTACHMENT_COLOR4:
+ case SWR_ATTACHMENT_COLOR5:
+ case SWR_ATTACHMENT_COLOR6:
+ case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+ case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
+ case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
+ default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+ }
+
+ // only a dirty tile is written back before its slice is replaced
+ if (hotTile.state == HOTTILE_DIRTY)
+ {
+ pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+ }
+
+ pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+
+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+ hotTile.state = HOTTILE_DIRTY;
+ }
+ }
+ return &tile.Attachment[attachment];
+ }
+
+ // Returns the full set of hot tiles (all attachments) for macrotile `macroID`.
+ HotTileSet &GetHotTile(uint32_t macroID)
+ {
+ uint32_t x, y;
+ MacroTileMgr::getTileIndices(macroID, x, y);
+ assert(x < KNOB_NUM_HOT_TILES_X);
+ assert(y < KNOB_NUM_HOT_TILES_Y);
+
+ return mHotTiles[x][y];
+ }
+
+private:
+ // hot tile descriptors, indexed [tile x][tile y]
+ HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
+ // bytes needed per sample for one macrotile of each attachment
+ uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+};
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
new file mode 100644
index 00000000000..f36452f2cec
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -0,0 +1,148 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file utils.cpp
+*
+* @brief Utilities used by SWR core.
+*
+******************************************************************************/
+#if defined(_WIN32)
+
+#include<Windows.h>
+#include <Gdiplus.h>
+#include <Gdiplusheaders.h>
+#include <cstdint>
+
+using namespace Gdiplus;
+
+// Looks up the installed GDI+ image encoder whose MIME type equals `format`.
+// @param format - MIME type to match, e.g. L"image/png"
+// @param pClsid - receives the CLSID of the matching encoder (written only on success)
+// @return index of the matching encoder in the encoder list, or -1 on failure
+int GetEncoderClsid(const WCHAR* format, CLSID* pClsid)
+{
+    uint32_t num = 0;          // number of image encoders
+    uint32_t size = 0;         // size of the image encoder array in bytes
+
+    GetImageEncodersSize(&num, &size);
+    if (size == 0)
+    {
+        return -1;  // Failure
+    }
+
+    ImageCodecInfo* pImageCodecInfo = (ImageCodecInfo*)(malloc(size));
+    if (pImageCodecInfo == nullptr)
+    {
+        return -1;  // Failure
+    }
+
+    int result = -1;
+
+    // Only scan the array when GDI+ reports that it filled it in; on failure
+    // the malloc'ed buffer is uninitialized and must not be dereferenced.
+    if (GetImageEncoders(num, size, pImageCodecInfo) == Gdiplus::Ok)
+    {
+        for (uint32_t j = 0; j < num; ++j)
+        {
+            if (wcscmp(pImageCodecInfo[j].MimeType, format) == 0)
+            {
+                *pClsid = pImageCodecInfo[j].Clsid;
+                result = (int)j;  // Success
+                break;
+            }
+        }
+    }
+
+    free(pImageCodecInfo);
+    return result;
+}
+
+// Writes a width x height image of 4-byte pixels to a PNG file via GDI+.
+// @param pFilename - path of the file to write
+// @param pBuffer   - tightly packed pixels, 4 bytes each, row after row
+// @param width     - image width in pixels
+// @param height    - image height in pixels
+// Nothing is written if GDI+ cannot be started or no PNG encoder is found.
+void SaveImageToPNGFile(
+    const WCHAR *pFilename,
+    void *pBuffer,
+    uint32_t width,
+    uint32_t height)
+{
+    // dump pixels to a png
+    // Initialize GDI+.
+    GdiplusStartupInput gdiplusStartupInput;
+    ULONG_PTR gdiplusToken;
+    if (GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr) != Gdiplus::Ok)
+    {
+        return;
+    }
+
+    Bitmap *bitmap = new Bitmap(width, height);
+    BYTE *pBytes = (BYTE*)pBuffer;
+    static const uint32_t bytesPerPixel = 4;
+    for (uint32_t y = 0; y < height; ++y)
+    {
+        for (uint32_t x = 0; x < width; ++x)
+        {
+            uint32_t pixel = *(uint32_t*)pBytes;
+            // Two fill patterns are replaced by marker colors so they stand
+            // out in the image (presumably the MSVC debug-heap fills for
+            // uninitialized / freed memory - confirm).
+            if (pixel == 0xcdcdcdcd)
+            {
+                pixel = 0xFFFF00FF;
+            }
+            else if (pixel == 0xdddddddd)
+            {
+                pixel = 0x80FF0000;
+            }
+            else
+            {
+                // force the top byte (alpha) to fully opaque
+                pixel |= 0xFF000000;
+            }
+            Color color(pixel);
+            bitmap->SetPixel(x, y, color);
+            pBytes += bytesPerPixel;
+        }
+    }
+
+    // Save image. Skip the save when the encoder lookup fails, because
+    // pngClsid is left uninitialized in that case.
+    CLSID pngClsid;
+    if (GetEncoderClsid(L"image/png", &pngClsid) >= 0)
+    {
+        bitmap->Save(pFilename, &pngClsid, nullptr);
+    }
+
+    // the bitmap must be released before GDI+ is shut down
+    delete bitmap;
+
+    GdiplusShutdown(gdiplusToken);
+}
+
+// Loads an image file through GDI+ into a newly allocated pixel buffer.
+// @param pFilename - path of the image file to open
+// @param pBuffer   - receives a buffer allocated with new BYTE[]; 4 bytes per
+//                    pixel in B, G, R, A order, with the image rows stored in
+//                    reverse (last GDI+ row first). Not freed by this function.
+// @param width     - receives the image width in pixels
+// @param height    - receives the image height in pixels
+// If GDI+ cannot be started, *pBuffer is set to nullptr and both dimensions to 0.
+void OpenBitmapFromFile(
+    const WCHAR *pFilename,
+    void **pBuffer,
+    uint32_t *width,
+    uint32_t *height)
+{
+    GdiplusStartupInput gdiplusStartupInput;
+    ULONG_PTR gdiplusToken;
+    if (GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr) != Gdiplus::Ok)
+    {
+        *pBuffer = nullptr;
+        *width = 0;
+        *height = 0;
+        return;
+    }
+
+    Bitmap *bitmap = new Bitmap(pFilename);
+
+    *width = bitmap->GetWidth();
+    *height = bitmap->GetHeight();
+
+    // width * height * |RGBA|, computed in size_t so large images do not wrap
+    // the 32-bit product
+    BYTE *pPixels = new BYTE[(size_t)*width * (size_t)*height * 4];
+    *pBuffer = pPixels;
+
+    // The folder 'stb_image' contains a PNG open/close module which
+    // is far less painful than this is, yo.
+    Gdiplus::Color clr;
+    size_t idx = 0;
+    for (uint32_t y = 0; y < *height; ++y)
+    {
+        for (uint32_t x = 0; x < *width; ++x, idx += 4)
+        {
+            // rows are read bottom-up
+            bitmap->GetPixel(x, *height - y - 1, &clr);
+            pPixels[idx + 0] = clr.GetBlue();
+            pPixels[idx + 1] = clr.GetGreen();
+            pPixels[idx + 2] = clr.GetRed();
+            pPixels[idx + 3] = clr.GetAlpha();
+        }
+    }
+
+    // the bitmap must be released before GDI+ is shut down
+    delete bitmap;
+
+    // Balance the GdiplusStartup call above, as SaveImageToPNGFile does.
+    GdiplusShutdown(gdiplusToken);
+}
+#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
new file mode 100644
index 00000000000..8a59ef24fee
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -0,0 +1,831 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file utils.h
+*
+* @brief Utilities used by SWR core.
+*
+******************************************************************************/
+#pragma once
+
+#include <string.h>
+#include "common/os.h"
+#include "common/simdintrin.h"
+#include "common/swr_assert.h"
+
+#if defined(_WIN32)
+// Writes a width x height image of 4-byte pixels to a PNG file
+// (Windows only; implemented in utils.cpp on top of GDI+).
+// @param pFilename - path of the file to write
+// @param pBuffer - source pixels, 4 bytes per pixel
+// @param width, height - image dimensions in pixels
+void SaveImageToPNGFile(
+ const WCHAR *pFilename,
+ void *pBuffer,
+ uint32_t width,
+ uint32_t height);
+
+// Loads an image file into a newly allocated pixel buffer
+// (Windows only; implemented in utils.cpp on top of GDI+).
+// @param pFilename - path of the image file to open
+// @param pBuffer - receives a buffer allocated with new BYTE[], 4 bytes per pixel
+// @param width, height - receive the image dimensions in pixels
+void OpenBitmapFromFile(
+ const WCHAR *pFilename,
+ void **pBuffer,
+ uint32_t *width,
+ uint32_t *height);
+#endif
+
+/// @todo assume linux is always 64 bit
+#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
+#define _MM_INSERT_EPI64 _mm_insert_epi64
+#define _MM_EXTRACT_EPI64 _mm_extract_epi64
+#else
+// Fallback for targets without _mm_extract_epi64: returns one 64-bit lane of `a`.
+// @param a   - source vector
+// @param ndx - 0 selects the low lane; any other value selects the high lane
+INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
+{
+    // spill the vector to memory as four 32-bit words
+    OSALIGNLINE(uint32_t) dwords[4];
+    _mm_store_si128((__m128i*)dwords, a);
+
+    // index of the first 32-bit word of the requested lane
+    const uint32_t base = (ndx == 0) ? 0 : 2;
+
+    // rebuild the 64-bit value from its two halves
+    const uint64_t lowHalf = dwords[base];
+    const uint64_t highHalf = dwords[base + 1];
+    return lowHalf | (highHalf << 32);
+}
+
+// Fallback for targets without _mm_insert_epi64: returns a copy of `a` with
+// one 64-bit lane replaced by `b`.
+// @param a   - source vector
+// @param b   - value to insert
+// @param ndx - 0 replaces the low lane; any other value replaces the high lane
+INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
+{
+    // spill the vector to memory as two 64-bit lanes
+    OSALIGNLINE(int64_t) lanes[2];
+    _mm_store_si128((__m128i*)lanes, a);
+
+    lanes[(ndx == 0) ? 0 : 1] = b;
+
+    return _mm_load_si128((const __m128i*)lanes);
+}
+#endif
+
+// Integer bounding box described by its four edges.
+OSALIGNLINE(struct) BBOX
+{
+    int top, bottom, left, right;
+
+    // Leaves all four edges uninitialized.
+    BBOX() {}
+    BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
+
+    // Comparisons are const-qualified so that const boxes (and temporaries
+    // bound to const references) can be compared as well.
+    bool operator==(const BBOX& rhs) const
+    {
+        return (this->top == rhs.top &&
+                this->bottom == rhs.bottom &&
+                this->left == rhs.left &&
+                this->right == rhs.right);
+    }
+
+    bool operator!=(const BBOX& rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+// Bounding box edges held in SIMD integer registers (one simdscalari per edge).
+struct simdBBox
+{
+    simdscalari top;
+    simdscalari bottom;
+    simdscalari left;
+    simdscalari right;
+};
+
+// In-place 4x4 transpose of 32-bit elements held in four float vectors:
+// on return, rowN contains element N of each incoming row.
+INLINE
+void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
+{
+    // work on the raw bits so the integer unpack instructions can be used
+    const __m128i in0 = _mm_castps_si128(row0);
+    const __m128i in1 = _mm_castps_si128(row1);
+    const __m128i in2 = _mm_castps_si128(row2);
+    const __m128i in3 = _mm_castps_si128(row3);
+
+    // interleave the 32-bit elements of each pair of rows
+    const __m128i lo01 = _mm_unpacklo_epi32(in0, in1);
+    const __m128i hi01 = _mm_unpackhi_epi32(in0, in1);
+    const __m128i lo23 = _mm_unpacklo_epi32(in2, in3);
+    const __m128i hi23 = _mm_unpackhi_epi32(in2, in3);
+
+    // join the 64-bit halves to form the transposed rows
+    row0 = _mm_castsi128_ps(_mm_unpacklo_epi64(lo01, lo23));
+    row1 = _mm_castsi128_ps(_mm_unpackhi_epi64(lo01, lo23));
+    row2 = _mm_castsi128_ps(_mm_unpacklo_epi64(hi01, hi23));
+    row3 = _mm_castsi128_ps(_mm_unpackhi_epi64(hi01, hi23));
+}
+
+// In-place 4x4 transpose of 32-bit elements held in four integer vectors:
+// on return, rowN contains element N of each incoming row.
+INLINE
+void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
+{
+    // interleave the 32-bit elements of each pair of rows
+    const __m128i lo01 = _mm_unpacklo_epi32(row0, row1);
+    const __m128i hi01 = _mm_unpackhi_epi32(row0, row1);
+    const __m128i lo23 = _mm_unpacklo_epi32(row2, row3);
+    const __m128i hi23 = _mm_unpackhi_epi32(row2, row3);
+
+    // join the 64-bit halves to form the transposed rows
+    row0 = _mm_unpacklo_epi64(lo01, lo23);
+    row1 = _mm_unpackhi_epi64(lo01, lo23);
+    row2 = _mm_unpacklo_epi64(hi01, hi23);
+    row3 = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+// GCC version folded into one comparable number:
+// major * 10000 + minor * 100 + patchlevel (e.g. 4.9.0 -> 40900).
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
+
+// For GCC older than 4.9.0, map the "undefined value" intrinsics to their
+// zeroing counterparts (presumably because those compilers do not provide
+// the _mm*_undefined_* intrinsics - confirm).
+#if defined(__GNUC__) && (GCC_VERSION < 40900)
+#define _mm_undefined_ps _mm_setzero_ps
+#define _mm_undefined_si128 _mm_setzero_si128
+#if KNOB_SIMD_WIDTH == 8
+#define _mm256_undefined_ps _mm256_setzero_ps
+#endif
+#endif
+
+#if KNOB_SIMD_WIDTH == 8
+// Converts three 8-wide SOA registers (x, y, z) into eight 4-float AOS
+// vectors, one per element. The fourth lane of every output comes from
+// _mm256_undefined_ps() (shown as 'w' below), so its value is unspecified.
+INLINE
+void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
+{
+    // first level: interleave x with z and y with the filler
+    const __m256 xzLo = _mm256_unpacklo_ps(vSrc0, vSrc2);                   //x0z0x1z1 x4z4x5z5
+    const __m256 ywLo = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());   //y0w0y1w1 y4w4y5w5
+    const __m256 xzHi = _mm256_unpackhi_ps(vSrc0, vSrc2);                   //x2z2x3z3 x6z6x7z7
+    const __m256 ywHi = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());   //y2w2y3w3 y6w6y7w7
+
+    // second level: each register now holds two complete xyzw elements
+    const __m256 elemPairs[4] =
+    {
+        _mm256_unpacklo_ps(xzLo, ywLo),     //x0y0z0w0 x4y4z4w4
+        _mm256_unpackhi_ps(xzLo, ywLo),     //x1y1z1w1 x5y5z5w5
+        _mm256_unpacklo_ps(xzHi, ywHi),     //x2y2z2w2 x6y6z6w6
+        _mm256_unpackhi_ps(xzHi, ywHi),     //x3y3z3w3 x7y7z7w7
+    };
+
+    // low 128 bits hold elements 0..3, high 128 bits hold elements 4..7
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        vDst[i] = _mm256_castps256_ps128(elemPairs[i]);
+        vDst[i + 4] = _mm256_extractf128_ps(elemPairs[i], 1);
+    }
+}
+
+// Converts four 8-wide SOA registers (x, y, z, w) into eight 4-float AOS
+// vectors, one xyzw vector per element.
+INLINE
+void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
+{
+    // first level: interleave x with z and y with w
+    const __m256 xzLo = _mm256_unpacklo_ps(vSrc0, vSrc2);   //x0z0x1z1 x4z4x5z5
+    const __m256 ywLo = _mm256_unpacklo_ps(vSrc1, vSrc3);   //y0w0y1w1 y4w4y5w5
+    const __m256 xzHi = _mm256_unpackhi_ps(vSrc0, vSrc2);   //x2z2x3z3 x6z6x7z7
+    const __m256 ywHi = _mm256_unpackhi_ps(vSrc1, vSrc3);   //y2w2y3w3 y6w6y7w7
+
+    // second level: each register now holds two complete xyzw elements
+    const __m256 elemPairs[4] =
+    {
+        _mm256_unpacklo_ps(xzLo, ywLo),     //x0y0z0w0 x4y4z4w4
+        _mm256_unpackhi_ps(xzLo, ywLo),     //x1y1z1w1 x5y5z5w5
+        _mm256_unpacklo_ps(xzHi, ywHi),     //x2y2z2w2 x6y6z6w6
+        _mm256_unpackhi_ps(xzHi, ywHi),     //x3y3z3w3 x7y7z7w7
+    };
+
+    // low 128 bits hold elements 0..3, high 128 bits hold elements 4..7
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        vDst[i] = _mm256_castps256_ps128(elemPairs[i]);
+        vDst[i + 4] = _mm256_extractf128_ps(elemPairs[i], 1);
+    }
+}
+
+// 8x8 transpose of 32-bit elements: vDst[N] receives element N of each of
+// the eight input rows. All inputs are read before vDst is written.
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
+{
+    // stage 1: interleave 32-bit elements of adjacent row pairs
+    const __m256 lo01 = _mm256_unpacklo_ps(vMask0, vMask1);
+    const __m256 hi01 = _mm256_unpackhi_ps(vMask0, vMask1);
+    const __m256 lo23 = _mm256_unpacklo_ps(vMask2, vMask3);
+    const __m256 hi23 = _mm256_unpackhi_ps(vMask2, vMask3);
+    const __m256 lo45 = _mm256_unpacklo_ps(vMask4, vMask5);
+    const __m256 hi45 = _mm256_unpackhi_ps(vMask4, vMask5);
+    const __m256 lo67 = _mm256_unpacklo_ps(vMask6, vMask7);
+    const __m256 hi67 = _mm256_unpackhi_ps(vMask6, vMask7);
+
+    // stage 2: gather 64-bit pairs; rows 0-3 and rows 4-7 are kept apart
+    const __m256 upperRows[4] =
+    {
+        _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1,0,1,0)),
+        _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3,2,3,2)),
+        _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1,0,1,0)),
+        _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3,2,3,2)),
+    };
+    const __m256 lowerRows[4] =
+    {
+        _mm256_shuffle_ps(lo45, lo67, _MM_SHUFFLE(1,0,1,0)),
+        _mm256_shuffle_ps(lo45, lo67, _MM_SHUFFLE(3,2,3,2)),
+        _mm256_shuffle_ps(hi45, hi67, _MM_SHUFFLE(1,0,1,0)),
+        _mm256_shuffle_ps(hi45, hi67, _MM_SHUFFLE(3,2,3,2)),
+    };
+
+    // stage 3: combine matching 128-bit halves (0x20 = both low halves,
+    // 0x31 = both high halves)
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        vDst[i] = _mm256_permute2f128_ps(upperRows[i], lowerRows[i], 0x20);
+        vDst[i + 4] = _mm256_permute2f128_ps(upperRows[i], lowerRows[i], 0x31);
+    }
+}
+
+// Integer-input flavor of the 8x8 transpose: reinterprets each row as floats
+// (bit cast, no conversion) and forwards to the float overload.
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
+{
+    const __m256 row0 = _mm256_castsi256_ps(vMask0);
+    const __m256 row1 = _mm256_castsi256_ps(vMask1);
+    const __m256 row2 = _mm256_castsi256_ps(vMask2);
+    const __m256 row3 = _mm256_castsi256_ps(vMask3);
+    const __m256 row4 = _mm256_castsi256_ps(vMask4);
+    const __m256 row5 = _mm256_castsi256_ps(vMask5);
+    const __m256 row6 = _mm256_castsi256_ps(vMask6);
+    const __m256 row7 = _mm256_castsi256_ps(vMask7);
+
+    vTranspose8x8(vDst, row0, row1, row2, row3, row4, row5, row6, row7);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// TransposeSingleComponent
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t bpp>
+struct TransposeSingleComponent
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Single-component data is laid out identically in SOA and AOS
+    ///        form, so the "transpose" is a straight copy of one SIMD
+    ///        vector's worth of pixels.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        // bpp bits per pixel, KNOB_SIMD_WIDTH pixels, 8 bits per byte
+        const size_t numBytes = (bpp * KNOB_SIMD_WIDTH) / 8;
+        memcpy(pDst, pSrc, numBytes);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8_8
+//////////////////////////////////////////////////////////////////////////
+// SOA -> AOS conversion for 8 pixels of four 8-bit components.
+// Two code paths exist (AVX and AVX2); both produce 32 bytes of output.
+// NOTE(review): with KNOB_SIMD_WIDTH == 8 and a KNOB_ARCH that is neither
+// AVX nor AVX2, no branch is compiled and nothing is written to pDst.
+struct Transpose8_8_8_8
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ {
+ simdscalari src = _simd_load_si((const simdscalari*)pSrc);
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH == KNOB_ARCH_AVX
+ // AVX path: split into 128-bit halves and interleave with SSE unpacks
+ __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
+ __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
+ __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
+ __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
+ __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
+ __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
+ __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
+ __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
+ _mm_store_si128((__m128i*)pDst, c0123lo);
+ _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+ // AVX2 path: two byte shuffles (control bytes of 0x80 produce zero), one
+ // on the source and one on the source with its 128-bit halves swapped,
+ // then OR the two partial results together
+ simdscalari dst01 = _mm256_shuffle_epi8(src,
+ _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+ simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
+ dst23 = _mm256_shuffle_epi8(dst23,
+ _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+ simdscalari dst = _mm256_or_si256(dst01, dst23);
+ _simd_store_si((simdscalari*)pDst, dst);
+#endif
+#else
+#error Unsupported vector width
+#endif
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8
+//////////////////////////////////////////////////////////////////////////
+// Placeholder for 3-component 8-bit data: Transpose is declared but deleted,
+// so any attempt to call it is a compile-time error.
+struct Transpose8_8_8
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for 8 pixels of two 8-bit components:
+    ///        the 8 red bytes and 8 green bytes in the low 128 bits of the
+    ///        source are interleaved into 16 bytes of r,g pairs.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        simdscalari vSoa = _simd_load_si((const simdscalari*)pSrc);
+
+#if KNOB_SIMD_WIDTH == 8
+        const __m128i redGreen = _mm256_castsi256_si128(vSoa);              // rrrrrrrr gggggggg
+        const __m128i green = _mm_unpackhi_epi64(redGreen, redGreen);       // gggggggg gggggggg
+        const __m128i interleaved = _mm_unpacklo_epi8(redGreen, green);     // rgrgrgrg rgrgrgrg
+        _mm_store_si128((__m128i*)pDst, interleaved);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32_32_32
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32_32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for 8 pixels of four 32-bit components.
+    ///        Reads four 8-wide registers and writes eight 4-component
+    ///        vectors (128 bytes in total).
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const float* pfSrc = (const float*)pSrc;
+        simdscalar comp0 = _simd_load_ps(pfSrc);
+        simdscalar comp1 = _simd_load_ps(pfSrc + 8);
+        simdscalar comp2 = _simd_load_ps(pfSrc + 16);
+        simdscalar comp3 = _simd_load_ps(pfSrc + 24);
+
+        __m128 vPixels[8];
+        vTranspose4x8(vPixels, comp0, comp1, comp2, comp3);
+
+        // one 4-float vector per pixel, stored back to back
+        float* pfDst = (float*)pDst;
+        for (uint32_t i = 0; i < 8; ++i)
+        {
+            _mm_store_ps(pfDst + 4 * i, vPixels[i]);
+        }
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32_32
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for 8 pixels of three 32-bit components.
+    ///        Reads three 8-wide registers and writes eight 4-float vectors
+    ///        (128 bytes in total); the fourth float of each vector is
+    ///        whatever vTranspose3x8 leaves in that lane.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const float* pfSrc = (const float*)pSrc;
+        simdscalar comp0 = _simd_load_ps(pfSrc);
+        simdscalar comp1 = _simd_load_ps(pfSrc + 8);
+        simdscalar comp2 = _simd_load_ps(pfSrc + 16);
+
+        __m128 vPixels[8];
+        vTranspose3x8(vPixels, comp0, comp1, comp2);
+
+        // one 4-float vector per pixel, stored back to back
+        float* pfDst = (float*)pDst;
+        for (uint32_t i = 0; i < 8; ++i)
+        {
+            _mm_store_ps(pfDst + 4 * i, vPixels[i]);
+        }
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 32_32 data. Reads 16 floats
+    ///        (eight values of the first component followed by eight of the
+    ///        second) and writes them interleaved pairwise.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        const float* pIn = (const float*)pSrc;
+        float* pOut = (float*)pDst;
+
+        // Elements 0..3 and 4..7 of the first (r) and second (g) component.
+        // All loads are issued before the first store.
+        __m128 vR03 = _mm_load_ps(pIn);
+        __m128 vR47 = _mm_load_ps(pIn + 4);
+        __m128 vG03 = _mm_load_ps(pIn + 8);
+        __m128 vG47 = _mm_load_ps(pIn + 12);
+
+        // unpacklo/unpackhi interleave the low/high halves: r0 g0 r1 g1, r2 g2 r3 g3, ...
+        __m128 vPair01 = _mm_unpacklo_ps(vR03, vG03);
+        __m128 vPair23 = _mm_unpackhi_ps(vR03, vG03);
+        __m128 vPair45 = _mm_unpacklo_ps(vR47, vG47);
+        __m128 vPair67 = _mm_unpackhi_ps(vR47, vG47);
+
+        _mm_store_ps(pOut, vPair01);
+        _mm_store_ps(pOut + 4, vPair23);
+        _mm_store_ps(pOut + 8, vPair45);
+        _mm_store_ps(pOut + 12, vPair67);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16_16_16
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16_16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 16_16_16_16 data. Two SIMD
+    ///        registers are read (r and g in the first, b and a in the
+    ///        second) and four 128-bit vectors of interleaved components
+    ///        are written.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const simdscalari* pIn = (const simdscalari*)pSrc;
+        __m128i* pOut = (__m128i*)pDst;
+
+        simdscalari vRG = _simd_load_si(pIn);
+        simdscalari vBA = _simd_load_si(pIn + 1);
+
+        // Split each 256-bit register into its two 128-bit components.
+        __m128i vR = _mm256_extractf128_si256(vRG, 0);
+        __m128i vG = _mm256_extractf128_si256(vRG, 1);
+        __m128i vB = _mm256_extractf128_si256(vBA, 0);
+        __m128i vA = _mm256_extractf128_si256(vBA, 1);
+
+        // 16-bit interleave: (r,g) pairs and (b,a) pairs.
+        __m128i vRGLo = _mm_unpacklo_epi16(vR, vG);
+        __m128i vRGHi = _mm_unpackhi_epi16(vR, vG);
+        __m128i vBALo = _mm_unpacklo_epi16(vB, vA);
+        __m128i vBAHi = _mm_unpackhi_epi16(vB, vA);
+
+        // 32-bit interleave of the pairs gives r,g,b,a per element.
+        _mm_store_si128(pOut + 0, _mm_unpacklo_epi32(vRGLo, vBALo));
+        _mm_store_si128(pOut + 1, _mm_unpackhi_epi32(vRGLo, vBALo));
+        _mm_store_si128(pOut + 2, _mm_unpacklo_epi32(vRGHi, vBAHi));
+        _mm_store_si128(pOut + 3, _mm_unpackhi_epi32(vRGHi, vBAHi));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16_16
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 16_16_16 data. r and g come
+    ///        from the first SIMD register, b from the 128 bits that follow
+    ///        it; the fourth slot of every output element is filled from an
+    ///        undefined register.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m128i* pOut = (__m128i*)pDst;
+
+        simdscalari vRG = _simd_load_si((const simdscalari*)pSrc);
+
+        __m128i vR = _mm256_extractf128_si256(vRG, 0);
+        __m128i vG = _mm256_extractf128_si256(vRG, 1);
+        __m128i vB = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
+        // No fourth source component exists; its lanes are left unspecified.
+        __m128i vA = _mm_undefined_si128();
+
+        // 16-bit interleave: (r,g) pairs and (b,x) pairs.
+        __m128i vRGLo = _mm_unpacklo_epi16(vR, vG);
+        __m128i vRGHi = _mm_unpackhi_epi16(vR, vG);
+        __m128i vBALo = _mm_unpacklo_epi16(vB, vA);
+        __m128i vBAHi = _mm_unpackhi_epi16(vB, vA);
+
+        // 32-bit interleave of the pairs gives r,g,b,x per element.
+        _mm_store_si128(pOut + 0, _mm_unpacklo_epi32(vRGLo, vBALo));
+        _mm_store_si128(pOut + 1, _mm_unpackhi_epi32(vRGLo, vBALo));
+        _mm_store_si128(pOut + 2, _mm_unpacklo_epi32(vRGHi, vBAHi));
+        _mm_store_si128(pOut + 3, _mm_unpackhi_epi32(vRGHi, vBAHi));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 16_16 data. One SIMD register
+    ///        is read; its lower and upper 128-bit halves are interleaved
+    ///        16 bits at a time and written as two 128-bit vectors.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        simdscalar vSrc = _simd_load_ps((const float*)pSrc);
+
+#if KNOB_SIMD_WIDTH == 8
+        // Lower half holds the first component, upper half the second.
+        __m128i vComp0 = _mm_castps_si128(_mm256_castps256_ps128(vSrc));
+        __m128i vComp1 = _mm_castps_si128(_mm256_extractf128_ps(vSrc, 1));
+
+        __m128i* pOut = (__m128i*)pDst;
+        _mm_store_si128(pOut, _mm_unpacklo_epi16(vComp0, vComp1));
+        _mm_store_si128(pOut + 1, _mm_unpackhi_epi16(vComp0, vComp1));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose24_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose24_8
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 24_8 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_8_24
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_8_24
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 32_8_24 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose4_4_4_4
+//////////////////////////////////////////////////////////////////////////
+struct Transpose4_4_4_4
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 4_4_4_4 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose5_6_5
+//////////////////////////////////////////////////////////////////////////
+struct Transpose5_6_5
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 5_6_5 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose9_9_9_5
+//////////////////////////////////////////////////////////////////////////
+struct Transpose9_9_9_5
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 9_9_9_5 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose5_5_5_1
+//////////////////////////////////////////////////////////////////////////
+struct Transpose5_5_5_1
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 5_5_5_1 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose10_10_10_2
+//////////////////////////////////////////////////////////////////////////
+struct Transpose10_10_10_2
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 10_10_10_2 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose11_11_10
+//////////////////////////////////////////////////////////////////////////
+struct Transpose11_11_10
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SOA to AOS conversion for packed 11_11_10 data. Declared as deleted: no implementation exists and any call is rejected at compile time.
+ /// @param pSrc - source data in SOA form
+ /// @param pDst - output data in AOS form
+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+};
+
+// Helper that unrolls a loop at compile time.
+// UnrollerL<Begin, End, Step>::step(func) calls func(Begin), func(Begin + Step),
+// ... and stops once the index equals End; End itself is never passed to func.
+template<int Begin, int End, int Step = 1>
+struct UnrollerL {
+    // The recursion terminates only when stepping from Begin lands exactly on
+    // End. Reject every other combination up front instead of letting the
+    // compiler recurse until it hits its template instantiation depth limit.
+    static_assert(Step != 0 && (End - Begin) % Step == 0 && (End - Begin) / Step > 0,
+        "UnrollerL: End must be reachable from Begin in a whole number of Steps");
+
+    template<typename Lambda>
+    INLINE static void step(Lambda& func) {
+        func(Begin);
+        UnrollerL<Begin + Step, End, Step>::step(func);
+    }
+};
+
+// Terminating case (Begin == End): nothing left to invoke.
+template<int End, int Step>
+struct UnrollerL<End, End, Step> {
+    template<typename Lambda>
+    INLINE static void step(Lambda&) {
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief General CRC compute using the SSE4.2 crc32 intrinsics.
+/// @param crc - running CRC value to continue from
+/// @param pData - start of the data to fold into the CRC
+/// @param size - number of bytes at pData
+/// @return updated CRC value
+INLINE
+uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
+{
+    // _mm_crc32_u64 only exists when targeting 64-bit x86, so select the
+    // qword path by target architecture rather than by operating system
+    // (a 32-bit Linux build must take the dword path).
+#if defined(_WIN64) || defined(__x86_64__)
+    const uint32_t sizeInQwords = size / sizeof(uint64_t);
+    const uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
+    const uint64_t* pDataWords = (const uint64_t*)pData;
+    for (uint32_t i = 0; i < sizeInQwords; ++i)
+    {
+        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
+    }
+#else
+    const uint32_t sizeInDwords = size / sizeof(uint32_t);
+    const uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
+    const uint32_t* pDataWords = (const uint32_t*)pData;
+    for (uint32_t i = 0; i < sizeInDwords; ++i)
+    {
+        crc = _mm_crc32_u32(crc, *pDataWords++);
+    }
+#endif
+
+    // Bytes left over after the last whole word are folded in one at a time.
+    const BYTE* pRemainderBytes = (const BYTE*)pDataWords;
+    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
+    {
+        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
+    }
+
+    return crc;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Add a byte offset to a pointer of any type.
+/// The pointer is converted to intptr_t, the (signed) offset is added, and
+/// the sum is converted back to T*.
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+INLINE
+static T* PtrAdd(T* p, intptr_t offset)
+{
+    const intptr_t base = reinterpret_cast<intptr_t>(p);
+    const intptr_t moved = base + offset;
+    return reinterpret_cast<T*>(moved);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Is a power-of-2?
+/// True when value equals (value & (0 - value)); note that this test also
+/// yields true for a value of 0.
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+INLINE
+static bool IsPow2(T value)
+{
+    return (value & (0 - value)) == value;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align down to specified alignment by clearing the bits below it.
+/// Note: IsPow2(alignment) MUST be true (checked with SWR_ASSERT)
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1 AlignDownPow2(T1 value, T2 alignment)
+{
+    SWR_ASSERT(IsPow2(alignment));
+    const T1 lowBits = T1(alignment - 1);
+    return value & ~lowBits;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align up to specified alignment: adds (alignment - 1) and aligns the
+/// sum down with AlignDownPow2.
+/// Note: IsPow2(alignment) MUST be true
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1 AlignUpPow2(T1 value, T2 alignment)
+{
+    // auto keeps the type of the sum exactly as the inline expression had it.
+    const auto bumped = value + T1(alignment - 1);
+    return AlignDownPow2(bumped, alignment);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align up ptr to specified alignment; the address arithmetic is done on
+/// uintptr_t and the result is converted back to T1*.
+/// Note: IsPow2(alignment) MUST be true
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1* AlignUpPow2(T1* value, T2 alignment)
+{
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(value);
+    const uintptr_t bumped = addr + uintptr_t(alignment - 1);
+    return reinterpret_cast<T1*>(AlignDownPow2(bumped, alignment));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align down to specified alignment.
+/// Uses AlignDownPow2 when IsPow2(alignment) holds, otherwise subtracts
+/// the remainder of value / alignment.
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1 AlignDown(T1 value, T2 alignment)
+{
+    if (!IsPow2(alignment))
+    {
+        return value - T1(value % alignment);
+    }
+    return AlignDownPow2(value, alignment);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align a pointer down to specified alignment; forwards the address as
+/// uintptr_t to the integer AlignDown and converts the result back.
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1* AlignDown(T1* value, T2 alignment)
+{
+    const uintptr_t addr = uintptr_t(value);
+    return (T1*)AlignDown(addr, alignment);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align up to specified alignment: adds (alignment - 1) and aligns the
+/// sum down with AlignDown, which handles both power-of-2 and
+/// non-power-of-2 alignments.
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1 AlignUp(T1 value, T2 alignment)
+{
+    // auto keeps the type of the sum exactly as the inline expression had it.
+    const auto bumped = value + T1(alignment - 1);
+    return AlignDown(bumped, alignment);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Align a pointer up to specified alignment: advances it by
+/// (alignment - 1) bytes with PtrAdd and aligns the result down with the
+/// pointer overload of AlignDown.
+//////////////////////////////////////////////////////////////////////////
+template <typename T1, typename T2>
+INLINE
+static T1* AlignUp(T1* value, T2 alignment)
+{
+    T1* const pBumped = PtrAdd(value, alignment - 1);
+    return AlignDown(pBumped, alignment);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Helper structure used to access an array of elements that don't
+/// correspond to a typical word size.
+/// Elements of BitsPerElementT bits are packed into size_t words, which
+/// are zero-initialized. Only a read accessor is provided here.
+/// @tparam T - type returned for each element
+/// @tparam BitsPerElementT - width of one element in bits; must divide
+///         the bit width of size_t evenly (up to the full word width)
+/// @tparam ArrayLenT - number of elements in the array
+//////////////////////////////////////////////////////////////////////////
+template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
+class BitsArray
+{
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
+    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
+    // Built by shifting an all-ones word right rather than shifting 1 left:
+    // (size_t(1) << BitsPerElementT) would shift by the full word width when
+    // an element is as wide as a word, which is not a valid shift.
+    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);
+
+    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
+        "Element size must be an integral fraction of pointer size");
+
+    size_t m_words[NUM_WORDS] = {};
+
+public:
+
+    /// @brief Read the element at elementIndex.
+    /// @param elementIndex - index of the element; not range-checked
+    /// @return the element's bits converted to T
+    T operator[] (size_t elementIndex) const
+    {
+        // Pick the word holding the element, then shift it down to bit 0.
+        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
+        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
+        return T(word & ELEMENT_MASK);
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
new file mode 100644
index 00000000000..734c89792f0
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -0,0 +1,313 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file JitManager.cpp
+*
+* @brief Implementation of the Jit Manager.
+*
+* Notes:
+*
+******************************************************************************/
+#if defined(_WIN32)
+#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
+#endif
+
+#include "jit_api.h"
+#include "JitManager.h"
+#include "fetch_jit.h"
+
+#if defined(_WIN32)
+#include "llvm/ADT/Triple.h"
+#endif
+#include "llvm/IR/Function.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/IRReader/IRReader.h"
+
+#include "core/state.h"
+#include "common/containers.hpp"
+
+#include "state_llvm.h"
+
+#include <sstream>
+#if defined(_WIN32)
+#include <psapi.h>
+#include <cstring>
+
+#define INTEL_OUTPUT_DIR "c:\\Intel"
+#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR"
+#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
+#endif
+
+using namespace llvm;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for JitManager: initializes the native LLVM target, creates the first module and its execution engine, and caches commonly used LLVM types.
+/// @param simdWidth - SIMD width to be used in generated program (0 = pick a default from the ISA reported by mArch). @param arch - ISA request string used to construct mArch.
+JitManager::JitManager(uint32_t simdWidth, const char *arch)
+ : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
+{
+ InitializeNativeTarget();
+ InitializeNativeTargetAsmPrinter();
+ InitializeNativeTargetDisassembler();
+
+ TargetOptions tOpts;
+ tOpts.AllowFPOpFusion = FPOpFusion::Fast;
+ tOpts.NoInfsFPMath = false;
+ tOpts.NoNaNsFPMath = false;
+ tOpts.UnsafeFPMath = true;
+#if defined(_DEBUG)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 7
+ tOpts.NoFramePointerElim = true;
+#endif
+#endif
+
+ //tOpts.PrintMachineCode = true;
+
+ std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); // "ate" makes the counter below append after "JitModule"
+ fnName << mJitNumber++;
+ std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
+ mpCurrentModule = newModule.get(); // non-owning; ownership moves into the EngineBuilder below
+
+ auto &&EB = EngineBuilder(std::move(newModule));
+ EB.setTargetOptions(tOpts);
+ EB.setOptLevel(CodeGenOpt::Aggressive);
+
+ StringRef hostCPUName;
+
+ // force JIT to use the same CPU arch as the rest of swr
+ if(mArch.AVX512F())
+ {
+ assert(0 && "Implement AVX512 jitter");
+ hostCPUName = sys::getHostCPUName();
+ if (mVWidth == 0)
+ {
+ mVWidth = 16;
+ }
+ }
+ else if(mArch.AVX2())
+ {
+ hostCPUName = StringRef("core-avx2");
+ if (mVWidth == 0)
+ {
+ mVWidth = 8;
+ }
+ }
+ else if(mArch.AVX())
+ {
+ if (mArch.F16C())
+ {
+ hostCPUName = StringRef("core-avx-i");
+ }
+ else
+ {
+ hostCPUName = StringRef("corei7-avx");
+ }
+ if (mVWidth == 0)
+ {
+ mVWidth = 8;
+ }
+ }
+ else
+ {
+ hostCPUName = sys::getHostCPUName();
+ if (mVWidth == 0)
+ {
+ mVWidth = 8; // 4?
+ }
+ }
+
+ EB.setMCPU(hostCPUName);
+
+#if defined(_WIN32)
+ // Needed for MCJIT on windows
+ Triple hostTriple(sys::getProcessTriple());
+ hostTriple.setObjectFormat(Triple::ELF);
+ mpCurrentModule->setTargetTriple(hostTriple.getTriple());
+#endif // _WIN32
+
+ mpExec = EB.create(); // NOTE(review): the result is not checked for null here
+
+#if LLVM_USE_INTEL_JITEVENTS
+ JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
+ mpExec->RegisterJITEventListener(vTune);
+#endif
+
+ mFP32Ty = Type::getFloatTy(mContext); // float type
+ mInt8Ty = Type::getInt8Ty(mContext);
+ mInt32Ty = Type::getInt32Ty(mContext); // int type
+ mInt64Ty = Type::getInt64Ty(mContext); // 64-bit int type
+ mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+ mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+
+ // fetch function signature
+ // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
+ std::vector<Type*> fsArgs;
+ fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
+ fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
+
+ mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
+
+ mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth); // mVWidth-wide float vector
+ mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth); // mVWidth-wide int32 vector
+
+ mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false); // struct of four float vectors
+ mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false); // struct of four int32 vectors
+
+#if defined(_WIN32)
+ // explicitly instantiate used symbols from potentially statically linked libs
+ sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
+ sys::DynamicLibrary::AddSymbol("log2f", &log2f);
+ sys::DynamicLibrary::AddSymbol("sinf", &sinf);
+ sys::DynamicLibrary::AddSymbol("cosf", &cosf);
+ sys::DynamicLibrary::AddSymbol("powf", &powf);
+#endif
+
+#if defined(_WIN32)
+ if (KNOB_DUMP_SHADER_IR)
+ {
+ CreateDirectory(INTEL_OUTPUT_DIR, NULL);
+ CreateDirectory(SWR_OUTPUT_DIR, NULL);
+ CreateDirectory(JITTER_OUTPUT_DIR, NULL);
+ }
+
+ ///@todo Figure out a better solution for this.
+ // Redirect stdin, stdout, and stderr to attached console.
+ freopen("CONIN$", "r", stdin);
+ freopen("CONOUT$", "w", stdout);
+ freopen("CONOUT$", "w", stderr);
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create a new, empty LLVM module, make it the current module and
+///        hand it to the execution engine. Asserts that the previous
+///        module has been finalized and marks the new one as not finalized.
+void JitManager::SetupNewModule()
+{
+    SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
+
+    // Module name is "JitModule" followed by the running JIT counter.
+    std::stringstream moduleName;
+    moduleName << "JitModule" << mJitNumber++;
+
+    std::unique_ptr<Module> pModule(new Module(moduleName.str(), mContext));
+#if defined(_WIN32)
+    // Needed for MCJIT on windows
+    Triple processTriple(sys::getProcessTriple());
+    processTriple.setObjectFormat(Triple::ELF);
+    pModule->setTargetTriple(processTriple.getTriple());
+#endif // _WIN32
+
+    // Keep a raw, non-owning pointer; ownership moves to the execution engine.
+    mpCurrentModule = pModule.get();
+    mpExec->addModule(std::move(pModule));
+    mIsModuleFinalized = false;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create new LLVM module from IR.
+/// @param pIR - IR to parse; read as a null-terminated character string
+/// @return true when the IR parsed and the resulting module was added to
+///         the execution engine, false when parsing failed
+bool JitManager::SetupModuleFromIR(const uint8_t *pIR)
+{
+    const char* pIRText = (const char*)pIR;
+    std::unique_ptr<MemoryBuffer> pBuffer = MemoryBuffer::getMemBuffer(StringRef(pIRText), "");
+
+    SMDiagnostic Err;
+    std::unique_ptr<Module> pModule = parseIR(pBuffer->getMemBufferRef(), Err, mContext);
+
+    if (!pModule)
+    {
+        SWR_ASSERT(0, "Parse failed! Check Err for details.");
+        return false;
+    }
+
+#if defined(_WIN32)
+    // Needed for MCJIT on windows
+    Triple processTriple(sys::getProcessTriple());
+    processTriple.setObjectFormat(Triple::ELF);
+    pModule->setTargetTriple(processTriple.getTriple());
+#endif // _WIN32
+
+    // Keep a raw, non-owning pointer; ownership moves to the execution engine.
+    mpCurrentModule = pModule.get();
+    mpExec->addModule(std::move(pModule));
+    mIsModuleFinalized = false;
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Dump function to file. Does nothing unless KNOB_DUMP_SHADER_IR
+///        is set. Prints the module that contains f to
+///        "<function>.<fileName>.ll" and the function's CFG to
+///        "cfg.<function>.<fileName>.dot". On Windows both files go into a
+///        per-process directory below JITTER_OUTPUT_DIR; elsewhere the
+///        names are relative to the current directory.
+/// @param f - function to dump
+/// @param fileName - tag inserted into both output file names
+void JitManager::DumpToFile(Function *f, const char *fileName)
+{
+    if (KNOB_DUMP_SHADER_IR)
+    {
+        // Directory prefix shared by both output files; stays empty on
+        // non-Windows builds.
+        std::string outPrefix;
+#if defined(_WIN32)
+        DWORD pid = GetCurrentProcessId();
+        // Zero-initialized so the name is an empty string if the lookup fails.
+        TCHAR procname[MAX_PATH] = {};
+        GetModuleFileName(NULL, procname, MAX_PATH);
+        const char* pBaseName = strrchr(procname, '\\');
+        std::stringstream outDir;
+        outDir << JITTER_OUTPUT_DIR;
+        if (pBaseName != nullptr)
+        {
+            // pBaseName still starts with the path separator.
+            outDir << pBaseName;
+        }
+        else
+        {
+            // No backslash in the name: use it whole instead of streaming
+            // a null pointer.
+            outDir << "\\" << procname;
+        }
+        outDir << "_" << pid;
+        CreateDirectory(outDir.str().c_str(), NULL);
+        outPrefix = outDir.str() + "\\";
+#endif
+
+        // File names are assembled as std::string so that long function
+        // names cannot overflow a fixed-size buffer.
+        const std::string baseName = f->getName().str() + "." + fileName;
+
+        std::error_code EC;
+        const std::string irPath = outPrefix + baseName + ".ll";
+        raw_fd_ostream fd(irPath, EC, llvm::sys::fs::F_None);
+        if (!EC)
+        {
+            // Only write when the file was opened successfully.
+            Module* pModule = f->getParent();
+            pModule->print(fd, nullptr);
+            fd.flush();
+        }
+
+        const std::string cfgPath = outPrefix + "cfg." + baseName + ".dot";
+        raw_fd_ostream fd_cfg(cfgPath, EC, llvm::sys::fs::F_Text);
+        if (!EC)
+        {
+            WriteGraph(fd_cfg, (const Function*)f);
+            fd_cfg.flush();
+        }
+    }
+}
+
+extern "C"
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Create JIT context.
+    /// @param targetSimdWidth - SIMD width to be used in generated program.
+    /// @param arch - ISA request string passed on to the JitManager.
+    /// @return handle to a newly allocated JitManager; release it with
+    ///         JitDestroyContext.
+    HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch)
+    {
+        JitManager* pJitMgr = new JitManager(targetSimdWidth, arch);
+        return pJitMgr;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Destroy JIT context.
+    /// @param hJitContext - handle returned by JitCreateContext; the
+    ///        JitManager behind it is deleted.
+    void JITCALL JitDestroyContext(HANDLE hJitContext)
+    {
+        JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitContext);
+        delete pJitMgr;
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
new file mode 100644
index 00000000000..c974a611224
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -0,0 +1,186 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file JitManager.h
+*
+* @brief JitManager contains the LLVM data structures used for JIT generation
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/os.h"
+#include "common/isa.hpp"
+
+#if defined(_WIN32)
+#pragma warning(disable : 4146 4244 4267 4800 4996)
+#endif
+
+// llvm 3.7+ reuses "DEBUG" as an enum value
+#pragma push_macro("DEBUG")
+#undef DEBUG
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+#include "llvm/Config/llvm-config.h"
+#ifndef LLVM_VERSION_MAJOR
+#include "llvm/Config/config.h"
+#endif
+
+#include "llvm/IR/Verifier.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/Support/FileSystem.h"
+#define LLVM_F_NONE sys::fs::F_None
+
+#include "llvm/Analysis/Passes.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#include "llvm/PassManager.h"
+#else
+#include "llvm/IR/LegacyPassManager.h"
+using namespace llvm::legacy;
+#endif
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/Host.h"
+
+
+#pragma pop_macro("DEBUG")
+
+using namespace llvm;
+//////////////////////////////////////////////////////////////////////////
+/// JitInstructionSet
+/// @brief Subclass of InstructionSet that allows users to override
+/// the reporting of support for certain ISA features. This allows capping
+/// the jitted code to a certain feature level, e.g. jit AVX level code on
+/// a platform that supports AVX2.
+//////////////////////////////////////////////////////////////////////////
+class JitInstructionSet : public InstructionSet
+{
+public:
+    /// @param requestedIsa - requested ISA cap, compared case-insensitively
+    ///        against "avx" and "avx2". Any other value, including a null
+    ///        pointer, forces nothing and leaves the base class results
+    ///        untouched.
+    JitInstructionSet(const char* requestedIsa)
+        : isaRequest(requestedIsa ? requestedIsa : "")   // std::string must not be built from null
+    {
+        // tolower() is only defined for values representable as unsigned
+        // char (or EOF), so route each character through unsigned char.
+        std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(),
+            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+        // All force flags default to false; only the requested one is raised.
+        if(isaRequest == "avx")
+        {
+            bForceAVX = true;
+        }
+        else if(isaRequest == "avx2")
+        {
+            bForceAVX2 = true;
+        }
+    #if 0
+        else if(isaRequest == "avx512")
+        {
+            bForceAVX512 = true;
+        }
+    #endif
+    }
+
+    /// Reports no AVX2 when capped to AVX, otherwise defers to the base class.
+    bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
+    /// Reports no AVX512F when capped to AVX or AVX2, otherwise defers to the base class.
+    bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); }
+    /// Reports no BMI2 when capped to AVX, otherwise defers to the base class.
+    bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); }
+
+private:
+    bool bForceAVX = false;
+    bool bForceAVX2 = false;
+    bool bForceAVX512 = false;   ///< never set while the "avx512" branch is compiled out
+    std::string isaRequest;      ///< lower-cased copy of the requested ISA string
+};
+
+
+
+/// LLVMContext subclass used by JitManager; adds no members or behavior
+/// of its own.
+struct JitLLVMContext : public LLVMContext
+{
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// JitManager - holds the LLVM context, IR builder, execution engine, current module and cached LLVM types used for JIT generation.
+//////////////////////////////////////////////////////////////////////////
+struct JitManager
+{
+ JitManager(uint32_t w, const char *arch);
+ ~JitManager(){}; // NOTE(review): empty; mpExec is not released here - confirm who owns the execution engine
+
+ JitLLVMContext mContext; ///< LLVM context used for the types and modules created by this manager
+ IRBuilder<> mBuilder; ///< LLVM IR Builder
+ ExecutionEngine* mpExec; ///< execution engine that modules are added to
+
+ // Need to be rebuilt after a JIT and before building new IR
+ Module* mpCurrentModule; ///< non-owning pointer to the module most recently created
+ bool mIsModuleFinalized; ///< cleared whenever a new module is set up
+ uint32_t mJitNumber; ///< running counter appended to "JitModule" to name modules
+
+ uint32_t mVWidth; ///< SIMD width used for the vector types below
+
+ // Built in types.
+ Type* mInt8Ty;
+ Type* mInt32Ty;
+ Type* mInt64Ty;
+ Type* mFP32Ty;
+ StructType* mV4FP32Ty; ///< struct of four floats
+ StructType* mV4Int32Ty; ///< struct of four int32s
+
+ // helper scalar function types
+ FunctionType* mUnaryFPTy;
+ FunctionType* mBinaryFPTy;
+ FunctionType* mTrinaryFPTy;
+ FunctionType* mUnaryIntTy;
+ FunctionType* mBinaryIntTy;
+ FunctionType* mTrinaryIntTy;
+
+ Type* mSimtFP32Ty; ///< float vector of mVWidth elements
+ Type* mSimtInt32Ty; ///< int32 vector of mVWidth elements
+
+ Type* mSimdVectorInt32Ty; ///< struct of four mSimtInt32Ty
+ Type* mSimdVectorTy; ///< struct of four mSimtFP32Ty
+
+ // fetch shader types
+ FunctionType* mFetchShaderTy;
+
+ JitInstructionSet mArch; ///< ISA query object built from the constructor's arch string
+
+ void SetupNewModule();
+ bool SetupModuleFromIR(const uint8_t *pIR);
+
+ static void DumpToFile(Function *f, const char *fileName);
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
new file mode 100644
index 00000000000..954524afd3a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -0,0 +1,772 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file blend_jit.cpp
+*
+* @brief Implementation of the blend jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "blend_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+
+#include <sstream>
+
+// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
+#define QUANTIZE_THRESHOLD 2
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to Jitting a blend shader
+//////////////////////////////////////////////////////////////////////////
+struct BlendJit : public Builder
+{
+ BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; // forwards the JitManager to the Builder base; BlendJit declares no data members of its own
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits IR for one blend factor and writes it to 'result'.
+    /// @tparam Color - when true, channels 0..2 of 'result' are written
+    /// @tparam Alpha - when true, channel 3 of 'result' is written
+    /// @param factor - blend factor selector
+    /// @param constColor - blend constant color, one SIMD value per channel
+    /// @param src - primary source color
+    /// @param src1 - secondary (dual-source) color
+    /// @param dst - destination color
+    /// @param result - receives the selected channels of the factor
+    template<bool Color, bool Alpha>
+    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
+    {
+        Value* chan[4];
+
+        // (1 - v) on a SIMD float value
+        auto oneMinus = [this](Value* v) -> Value* { return FSUB(VIMMED1(1.0f), v); };
+
+        // replicate a single value into all four channels
+        auto splat = [&chan](Value* v) { chan[0] = chan[1] = chan[2] = chan[3] = v; };
+
+        // channel-wise copy of a four component color
+        auto copyQuad = [&chan](Value* const quad[4])
+        {
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                chan[i] = quad[i];
+            }
+        };
+
+        // channel-wise (1 - quad[i]), emitted in channel order
+        auto invertQuad = [&chan, &oneMinus](Value* const quad[4])
+        {
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                chan[i] = oneMinus(quad[i]);
+            }
+        };
+
+        switch (factor)
+        {
+        // constants
+        case BLENDFACTOR_ZERO:            splat(VIMMED1(0.0f)); break;
+        case BLENDFACTOR_ONE:             splat(VIMMED1(1.0f)); break;
+
+        // full color factors
+        case BLENDFACTOR_SRC_COLOR:       copyQuad(src); break;
+        case BLENDFACTOR_DST_COLOR:       copyQuad(dst); break;
+        case BLENDFACTOR_CONST_COLOR:     copyQuad(constColor); break;
+        case BLENDFACTOR_SRC1_COLOR:      copyQuad(src1); break;
+
+        // alpha replicated to every channel
+        case BLENDFACTOR_SRC_ALPHA:       splat(src[3]); break;
+        case BLENDFACTOR_DST_ALPHA:       splat(dst[3]); break;
+        case BLENDFACTOR_CONST_ALPHA:     splat(constColor[3]); break;
+        case BLENDFACTOR_SRC1_ALPHA:      splat(src1[3]); break;
+
+        // inverted color factors
+        case BLENDFACTOR_INV_SRC_COLOR:   invertQuad(src); break;
+        case BLENDFACTOR_INV_DST_COLOR:   invertQuad(dst); break;
+        case BLENDFACTOR_INV_CONST_COLOR: invertQuad(constColor); break;
+        case BLENDFACTOR_INV_SRC1_COLOR:  invertQuad(src1); break;
+
+        // inverted alpha replicated to every channel
+        case BLENDFACTOR_INV_SRC_ALPHA:   splat(oneMinus(src[3])); break;
+        case BLENDFACTOR_INV_DST_ALPHA:   splat(oneMinus(dst[3])); break;
+        case BLENDFACTOR_INV_CONST_ALPHA: splat(oneMinus(constColor[3])); break;
+        case BLENDFACTOR_INV_SRC1_ALPHA:  splat(oneMinus(src1[3])); break;
+
+        case BLENDFACTOR_SRC_ALPHA_SATURATE:
+            // rgb = min(As, 1 - Ad), a = 1
+            splat(VMINPS(src[3], oneMinus(dst[3])));
+            chan[3] = VIMMED1(1.0f);
+            break;
+
+        default:
+            SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
+            splat(VIMMED1(0.0f));
+            break;
+        }
+
+        // hand back only the channels this instantiation owns
+        if (Color)
+        {
+            for (uint32_t i = 0; i < 3; ++i)
+            {
+                result[i] = chan[i];
+            }
+        }
+
+        if (Alpha)
+        {
+            result[3] = chan[3];
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Clamps all four channels of 'src' to the range implied by the
+    ///        format's type: [0,1] for UNORM, [-1,1] for SNORM, untouched
+    ///        for FLOAT. The type of component 0 decides for every channel.
+    /// @param format - render target format
+    /// @param src - color to clamp in place (one SIMD value per channel)
+    void Clamp(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+        const SWR_TYPE type = info.type[0];
+
+        float lowerBound = 0.0f;
+        switch (type)
+        {
+        case SWR_TYPE_FLOAT:
+            // float targets are not clamped
+            return;
+
+        case SWR_TYPE_UNORM:
+            lowerBound = 0.0f;
+            break;
+
+        case SWR_TYPE_SNORM:
+            lowerBound = -1.0f;
+            break;
+
+        default:
+            // message spelling matches the one used by Quantize()
+            SWR_ASSERT(false, "Unsupported format type: %d", type);
+            return;
+        }
+
+        // upper bound is 1.0 for both normalized types
+        for (uint32_t c = 0; c < 4; ++c)
+        {
+            src[c] = VMINPS(VMAXPS(src[c], VIMMED1(lowerBound)), VIMMED1(1.0f));
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Replaces every channel the format does not provide with the
+    ///        format's default value for that channel.
+    /// @param format - render target format
+    /// @param src - color to patch in place
+    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        // mark the destination channels that some format component maps to
+        bool present[4] = { false, false, false, false };
+        for (uint32_t comp = 0; comp < info.numComps; ++comp)
+        {
+            present[info.swizzle[comp]] = true;
+        }
+
+        for (uint32_t chan = 0; chan < 4; ++chan)
+        {
+            if (present[chan])
+            {
+                continue;
+            }
+
+            // defaults are broadcast as an integer and bitcast to the float SIMD type
+            Value* defaultBits = VIMMED1((int)info.defaults[chan]);
+            src[chan] = BITCAST(defaultBits, mSimdFP32Ty);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Forces the default value into every channel whose format
+    ///        component is typed SWR_TYPE_UNUSED.
+    /// @param format - render target format
+    /// @param src - color to patch in place
+    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        for (uint32_t comp = 0; comp < info.numComps; ++comp)
+        {
+            if (info.type[comp] != SWR_TYPE_UNUSED)
+            {
+                continue;
+            }
+
+            // both the written channel and the default come from the swizzled slot
+            Value* defaultBits = VIMMED1((int)info.defaults[info.swizzle[comp]]);
+            src[info.swizzle[comp]] = BITCAST(defaultBits, mSimdFP32Ty);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Snaps components with bit-widths <= QUANTIZE_THRESHOLD to the
+    ///        values representable at that width. Only UNORM components are
+    ///        supported; any other type asserts.
+    /// @param format - render target format
+    /// @param src - color to quantize in place
+    void Quantize(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        for (uint32_t comp = 0; comp < info.numComps; ++comp)
+        {
+            // wide components keep their full precision
+            if (info.bpc[comp] > QUANTIZE_THRESHOLD)
+            {
+                continue;
+            }
+
+            const uint32_t chan = info.swizzle[comp];
+            const float maxCode = (float)((1 << info.bpc[comp]) - 1);
+
+            if (info.type[comp] != SWR_TYPE_UNORM)
+            {
+                SWR_ASSERT(false, "Unsupported format type: %d", info.type[comp]);
+                continue;
+            }
+
+            // scale to the integer code range, add 0.5 and truncate, then scale back
+            Value* scaled = FADD(FMUL(src[chan], VIMMED1(maxCode)), VIMMED1(0.5f));
+            Value* snapped = VROUND(scaled, C(_MM_FROUND_TO_ZERO));
+            src[chan] = FMUL(snapped, VIMMED1(1.0f /maxCode));
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits IR combining source and destination with a blend op.
+    ///        ADD/SUBTRACT/REVSUBTRACT use the factor-weighted colors;
+    ///        MIN/MAX use the unweighted colors.
+    /// @tparam Color - when true, channels 0..2 of 'result' are written
+    /// @tparam Alpha - when true, channel 3 of 'result' is written
+    /// @param blendOp - blend operation
+    /// @param src / srcFactor - source color and its blend factor
+    /// @param dst / dstFactor - destination color and its blend factor
+    /// @param result - receives the selected channels of the blended color
+    template<bool Color, bool Alpha>
+    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
+    {
+        // weighted terms are emitted up front for every op, source before dest per channel
+        Value* weightedSrc[4];
+        Value* weightedDst[4];
+        for (uint32_t ch = 0; ch < 4; ++ch)
+        {
+            weightedSrc[ch] = FMUL(src[ch], srcFactor[ch]);
+            weightedDst[ch] = FMUL(dst[ch], dstFactor[ch]);
+        }
+
+        Value* blended[4];
+        switch (blendOp)
+        {
+        case BLENDOP_ADD:
+            for (uint32_t ch = 0; ch < 4; ++ch)
+            {
+                blended[ch] = FADD(weightedSrc[ch], weightedDst[ch]);
+            }
+            break;
+
+        case BLENDOP_SUBTRACT:
+            for (uint32_t ch = 0; ch < 4; ++ch)
+            {
+                blended[ch] = FSUB(weightedSrc[ch], weightedDst[ch]);
+            }
+            break;
+
+        case BLENDOP_REVSUBTRACT:
+            for (uint32_t ch = 0; ch < 4; ++ch)
+            {
+                blended[ch] = FSUB(weightedDst[ch], weightedSrc[ch]);
+            }
+            break;
+
+        case BLENDOP_MIN:
+            for (uint32_t ch = 0; ch < 4; ++ch)
+            {
+                blended[ch] = VMINPS(src[ch], dst[ch]);
+            }
+            break;
+
+        case BLENDOP_MAX:
+            for (uint32_t ch = 0; ch < 4; ++ch)
+            {
+                blended[ch] = VMAXPS(src[ch], dst[ch]);
+            }
+            break;
+
+        default:
+            SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
+            blended[0] = blended[1] = blended[2] = blended[3] = VIMMED1(0.0f);
+            break;
+        }
+
+        // hand back only the channels this instantiation owns
+        if (Color)
+        {
+            for (uint32_t ch = 0; ch < 3; ++ch)
+            {
+                result[ch] = blended[ch];
+            }
+        }
+
+        if (Alpha)
+        {
+            result[3] = blended[3];
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits IR for a bitwise logic op between source and destination.
+    ///        Every path, including the unsupported-op fallback, produces
+    ///        integer SIMD values so the caller can AND the result with its
+    ///        integer per-channel masks.
+    /// @param logicOp - logic operation
+    /// @param src - source color as integer SIMD values (s == PS output)
+    /// @param dst - destination color as integer SIMD values (d == RT contents)
+    /// @param result - receives the four result channels
+    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
+    {
+        // bitwise complement
+        // todo: use avx andnot instr for the and/not combinations when the intrinsic is available
+        auto bitNot = [this](Value* v) -> Value* { return XOR(v, VIMMED1(0xFFFFFFFF)); };
+
+        for (uint32_t c = 0; c < 4; ++c)
+        {
+            Value* s = src[c];
+            Value* d = dst[c];
+            Value* r = nullptr;
+
+            switch (logicOp)
+            {
+            case LOGICOP_CLEAR:         r = VIMMED1(0); break;
+            case LOGICOP_NOR:           r = bitNot(OR(s, d)); break;      // ~(s | d)
+            case LOGICOP_AND_INVERTED:  r = AND(bitNot(s), d); break;     // ~s & d
+            case LOGICOP_COPY_INVERTED: r = bitNot(s); break;             // ~s
+            case LOGICOP_AND_REVERSE:   r = AND(bitNot(d), s); break;     // s & ~d
+            case LOGICOP_INVERT:        r = bitNot(d); break;             // ~d
+            case LOGICOP_XOR:           r = XOR(s, d); break;             // s ^ d
+            case LOGICOP_NAND:          r = bitNot(AND(s, d)); break;     // ~(s & d)
+            case LOGICOP_AND:           r = AND(s, d); break;             // s & d
+            case LOGICOP_EQUIV:         r = bitNot(XOR(s, d)); break;     // ~(s ^ d)
+            case LOGICOP_NOOP:          r = d; break;                     // d
+            case LOGICOP_OR_INVERTED:   r = OR(bitNot(s), d); break;      // ~s | d
+            case LOGICOP_COPY:          r = s; break;                     // s
+            case LOGICOP_OR_REVERSE:    r = OR(bitNot(d), s); break;      // s | ~d
+            case LOGICOP_OR:            r = OR(s, d); break;              // s | d
+            case LOGICOP_SET:           r = VIMMED1(0xFFFFFFFF); break;
+
+            default:
+                // report once, not once per channel
+                if (c == 0)
+                {
+                    SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
+                }
+                // integer zero (not 0.0f): keeps the result type consistent with all other ops
+                r = VIMMED1(0);
+                break;
+            }
+
+            result[c] = r;
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits IR for the alpha test and folds its outcome into the
+    ///        coverage mask stored at ppMask.
+    /// @param state - compile state; supplies alphaTestFormat and alphaTestFunction
+    /// @param pBlendState - pointer to the blend state holding the alpha reference
+    /// @param pAlpha - SIMD float alpha to test
+    /// @param ppMask - pointer to the SIMD mask that is loaded, ANDed with the
+    ///                 test result and stored back
+    ///
+    /// An unrecognized alphaTestFunction asserts and then falls back to
+    /// "always pass", so the mask is left unchanged instead of handing a null
+    /// value to the IR builder.
+    void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* pAlpha, Value* ppMask)
+    {
+        // load uint32_t reference
+        Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
+
+        Value* pTest = nullptr;
+        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
+        {
+            // convert float alpha to unorm8
+            // NOTE(review): scales by 256 and truncates, so alpha == 1.0 becomes 256; confirm whether 255 with rounding was intended
+            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
+            pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
+
+            // compare (unsigned integer)
+            switch (state.alphaTestFunction)
+            {
+            case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
+            case ZFUNC_NEVER: pTest = VIMMED1(false); break;
+            case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break;
+            case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break;
+            case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break;
+            case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break;
+            case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break;
+            case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break;
+            default:
+                SWR_ASSERT(false, "Invalid alpha test function");
+                pTest = VIMMED1(true);
+                break;
+            }
+        }
+        else
+        {
+            // cast ref to float
+            pRef = BITCAST(pRef, mSimdFP32Ty);
+
+            // compare (ordered float)
+            switch (state.alphaTestFunction)
+            {
+            case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
+            case ZFUNC_NEVER: pTest = VIMMED1(false); break;
+            case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break;
+            case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break;
+            case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break;
+            case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break;
+            case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break;
+            case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break;
+            default:
+                SWR_ASSERT(false, "Invalid alpha test function");
+                pTest = VIMMED1(true);
+                break;
+            }
+        }
+
+        // load current mask
+        Value* pMask = LOAD(ppMask);
+
+        // convert to int1 mask
+        pMask = MASK(pMask);
+
+        // and with alpha test result
+        pMask = AND(pMask, pTest);
+
+        // convert back to vector mask
+        pMask = VMASK(pMask);
+
+        // store new mask
+        STORE(pMask, ppMask);
+    }
+
+ Function* Create(const BLEND_COMPILE_STATE& state) // Emits the IR of one blend function specialized on `state` into the current module, runs the pass list on it and returns it
+ {
+ static std::size_t jitNum = 0;
+ // function name is "BlendShader" followed by a running counter (stream is opened at-end so << appends)
+ std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+ fnName << jitNum++;
+
+ // blend function signature
+ //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+
+ std::vector<Type*> args{
+ PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src
+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src1
+ Type::getInt32Ty(JM()->mContext), // sampleNum
+ PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst (typed here as a pointer to SIMD float; see the hot tile format static_assert below)
+ PointerType::get(mSimdFP32Ty, 0), // simdvector& result
+ PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask
+ PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask
+ };
+
+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
+ Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+
+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
+
+ IRB()->SetInsertPoint(entry);
+
+ // arguments
+ auto argitr = blendFunc->getArgumentList().begin();
+ Value* pBlendState = &*argitr++;
+ pBlendState->setName("pBlendState");
+ Value* pSrc = &*argitr++;
+ pSrc->setName("src");
+ Value* pSrc1 = &*argitr++;
+ pSrc1->setName("src1");
+ Value* sampleNum = &*argitr++;
+ sampleNum->setName("sampleNum");
+ Value* pDst = &*argitr++;
+ pDst->setName("pDst");
+ Value* pResult = &*argitr++;
+ pResult->setName("result");
+ Value* ppoMask = &*argitr++;
+ ppoMask->setName("ppoMask");
+ Value* ppMask = &*argitr++;
+ ppMask->setName("pMask");
+
+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+ Value* dst[4];
+ Value* constantColor[4];
+ Value* src[4];
+ Value* src1[4];
+ Value* result[4];
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ // load hot tile
+ dst[i] = LOAD(pDst, { i });
+
+ // load constant color
+ constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
+
+ // load src
+ src[i] = LOAD(pSrc, { i });
+
+ // load src1
+ src1[i] = LOAD(pSrc1, { i });
+ }
+ Value* currentMask = VIMMED1(-1); // all bits set: every lane starts out enabled
+ if(state.desc.alphaToCoverageEnable)
+ {
+ currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); // alpha * numSamples, converted to signed int per lane
+ }
+
+ // alpha test
+ if (state.desc.alphaTestEnable)
+ {
+ AlphaTest(state, pBlendState, src[3], ppMask);
+ }
+
+ // color blend
+ if (state.blendState.blendEnable)
+ {
+ // clamp sources
+ Clamp(state.format, src);
+ Clamp(state.format, src1);
+ Clamp(state.format, dst);
+ Clamp(state.format, constantColor);
+
+ // apply defaults to hottile contents to take into account missing components
+ ApplyDefaults(state.format, dst);
+
+ // Force defaults for unused 'X' components
+ ApplyUnusedDefaults(state.format, dst);
+
+ // Quantize low precision components
+ Quantize(state.format, dst);
+
+ // special case clamping for R11G11B10_float which has no sign bit
+ if (state.format == R11G11B10_FLOAT)
+ {
+ dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
+ dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
+ dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
+ dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
+ }
+
+ Value* srcFactor[4];
+ Value* dstFactor[4];
+ if (state.desc.independentAlphaBlendEnable)
+ {
+ GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
+ GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
+
+ GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+ GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
+
+ BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+ BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
+ }
+ else
+ {
+ GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
+ GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+
+ BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+ }
+
+ // store results out
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ STORE(result[i], pResult, { i });
+ }
+ }
+ // NOTE(review): pResult is written only inside the blendEnable and logicOpEnable branches; with both disabled this function stores nothing to it
+ if(state.blendState.logicOpEnable)
+ {
+ const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
+ SWR_ASSERT(info.type[0] == SWR_TYPE_UINT);
+ Value* vMask[4];
+ for(uint32_t i = 0; i < 4; i++)
+ {
+ switch(info.bpc[i])
+ {
+ case 0: vMask[i] = VIMMED1(0x00000000); break;
+ case 2: vMask[i] = VIMMED1(0x00000003); break;
+ case 5: vMask[i] = VIMMED1(0x0000001F); break;
+ case 6: vMask[i] = VIMMED1(0x0000003F); break;
+ case 8: vMask[i] = VIMMED1(0x000000FF); break;
+ case 10: vMask[i] = VIMMED1(0x000003FF); break;
+ case 11: vMask[i] = VIMMED1(0x000007FF); break;
+ case 16: vMask[i] = VIMMED1(0x0000FFFF); break;
+ case 24: vMask[i] = VIMMED1(0x00FFFFFF); break;
+ case 32: vMask[i] = VIMMED1(0xFFFFFFFF); break;
+ default:
+ vMask[i] = VIMMED1(0x0);
+ SWR_ASSERT(0, "Unsupported bpc for logic op\n");
+ break;
+ }
+ src[i] = BITCAST(src[i], mSimdInt32Ty);// source is reinterpreted only; vMask[i] is applied to the result after the logic op
+ dst[i] = BITCAST(dst[i], mSimdInt32Ty);
+ }
+
+ LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
+
+ // store results out
+ for(uint32_t i = 0; i < 4; ++i)
+ {
+ // clear upper bits from PS output not in RT format after doing logic op
+ result[i] = AND(result[i], vMask[i]);
+
+ STORE(BITCAST(result[i], mSimdFP32Ty), pResult, {i});
+ }
+ }
+
+ if(state.desc.oMaskEnable)
+ {
+ assert(!(state.desc.alphaToCoverageEnable)); // oMask is not expected together with alpha-to-coverage (plain assert here, SWR_ASSERT elsewhere)
+ // load current mask
+ Value* oMask = LOAD(ppoMask);
+ Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
+ oMask = AND(oMask, sampleMasked);
+ currentMask = AND(oMask, currentMask);
+ }
+
+ if(state.desc.sampleMaskEnable)
+ {
+ Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
+ Value* sampleMasked = SHL(C(1), sampleNum);
+ sampleMask = AND(sampleMask, sampleMasked);
+ sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
+ sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
+ currentMask = AND(sampleMask, currentMask);
+ }
+
+ if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
+ state.desc.oMaskEnable)
+ {
+ // load current mask
+ Value* pMask = LOAD(ppMask);
+ currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty); // lanes with currentMask > 0 (signed) become all ones, others zero
+ Value* outputMask = AND(pMask, currentMask);
+ // store new mask
+ STORE(outputMask, GEP(ppMask, C(0)));
+ }
+
+ RET_VOID();
+ // dump the IR as generated, run the per-function passes, then dump again as "optimized"
+ JitManager::DumpToFile(blendFunc, "");
+
+ FunctionPassManager passes(JM()->mpCurrentModule);
+ passes.add(createBreakCriticalEdgesPass());
+ passes.add(createCFGSimplificationPass());
+ passes.add(createEarlyCSEPass());
+ passes.add(createPromoteMemoryToRegisterPass());
+ passes.add(createCFGSimplificationPass());
+ passes.add(createEarlyCSEPass());
+ passes.add(createInstructionCombiningPass());
+ passes.add(createInstructionSimplifierPass());
+ passes.add(createConstantPropagationPass());
+ passes.add(createSCCPPass());
+ passes.add(createAggressiveDCEPass());
+
+ passes.run(*blendFunc);
+
+ JitManager::DumpToFile(blendFunc, "optimized");
+
+ return blendFunc;
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JITs from blend shader IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - LLVM function IR (handle to the llvm::Function to compile)
+/// @return PFN_BLEND_JIT_FUNC - pointer to blend code
+PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
+{
+ const llvm::Function *func = (const llvm::Function*)hFunc;
+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+ PFN_BLEND_JIT_FUNC pfnBlend;
+ pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); // the function is looked up by name in the execution engine
+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+ pJitMgr->mIsModuleFinalized = true;
+
+ return pfnBlend;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generates blend function IR for the given state and JIT compiles it
+/// @param hJitMgr - JitManager handle
+/// @param state - blend state to build function from
+/// @return PFN_BLEND_JIT_FUNC - pointer to the compiled blend function
+extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
+{
+    JitManager* const pManager = reinterpret_cast<JitManager*>(hJitMgr);
+
+    // the IR goes into a newly set up module
+    pManager->SetupNewModule();
+
+    // build the IR, then hand it straight to the JIT
+    BlendJit blendJit(pManager);
+    return JitBlendFunc(hJitMgr, blendJit.Create(state));
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
new file mode 100644
index 00000000000..057eb92b67e
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
@@ -0,0 +1,93 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file blend_jit.h
+*
+* @brief Definition of the blend jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/context.h"
+#include "core/state.h"
+
+struct RENDER_TARGET_BLEND_COMPILE_STATE // Per-render-target blend settings read by BlendJit::Create
+{
+ bool blendEnable; ///< emit the float blend path (clamp, blend factors, blend op)
+ bool logicOpEnable; ///< emit the integer logic op path
+ SWR_BLEND_FACTOR sourceAlphaBlendFactor; ///< read only when BLEND_DESC::independentAlphaBlendEnable is set
+ SWR_BLEND_FACTOR destAlphaBlendFactor; ///< read only when BLEND_DESC::independentAlphaBlendEnable is set
+ SWR_BLEND_FACTOR sourceBlendFactor; ///< color factor; also drives alpha when independent alpha blend is off
+ SWR_BLEND_FACTOR destBlendFactor; ///< color factor; also drives alpha when independent alpha blend is off
+ SWR_BLEND_OP colorBlendFunc; ///< color op; also applied to alpha when independent alpha blend is off
+ SWR_BLEND_OP alphaBlendFunc; ///< alpha op, read only when independent alpha blend is on
+ SWR_LOGIC_OP logicOpFunc; ///< operation passed to LogicOpFunc when logicOpEnable is set
+};
+
+enum ALPHA_TEST_FORMAT // Selects how BlendJit::AlphaTest compares alpha against the reference value
+{
+ ALPHA_TEST_UNORM8, ///< alpha is converted to an integer and compared unsigned against the integer reference
+ ALPHA_TEST_FLOAT32 ///< the reference bits are reinterpreted as float and compared with ordered float compares
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// BLEND_DESC
+//////////////////////////////////////////////////////////////////////////
+struct BLEND_DESC // Blend feature flags packed into one 32-bit word; `bits` aliases all of them at once
+{
+ union
+ {
+ struct
+ {
+ uint32_t alphaTestEnable: 1; ///< run AlphaTest on source alpha
+ uint32_t independentAlphaBlendEnable: 1; ///< use the separate alpha factors and alpha blend op
+ uint32_t alphaToCoverageEnable: 1; ///< derive the coverage mask from source alpha * numSamples
+ uint32_t oMaskEnable:1; ///< AND the mask loaded from the oMask pointer (restricted to the current sample bit) into coverage
+ uint32_t inputCoverageEnable:1; ///< not read by the blend jit code in blend_jit.cpp
+ uint32_t sampleMaskEnable:1; ///< test SWR_BLEND_STATE sampleMask against the current sample bit
+ uint32_t numSamples:5; ///< sample count used to scale alpha for alpha-to-coverage
+ uint32_t _reserved : 21;
+ };
+ uint32_t bits;
+ };
+};
+#define BLEND_ENABLE_MASK 0x3D // bits 0,2,3,4,5 of BLEND_DESC::bits. NOTE(review): assuming LSB-first bit-field allocation this is alphaTestEnable | a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable, i.e. it also covers alphaTestEnable; the four flags without it would be 0x3C - confirm which is intended
+//////////////////////////////////////////////////////////////////////////
+/// State required for blend jit
+//////////////////////////////////////////////////////////////////////////
+struct BLEND_COMPILE_STATE // Complete input to BlendJit::Create: one generated blend function per distinct state
+{
+ SWR_FORMAT format; // format of render target being blended
+ RENDER_TARGET_BLEND_COMPILE_STATE blendState; // blend / logic op selection for this render target
+ BLEND_DESC desc; // feature flags (alpha test, a2c, oMask, sample mask, ...)
+
+ SWR_ZFUNCTION alphaTestFunction; // comparison used by the alpha test
+ ALPHA_TEST_FORMAT alphaTestFormat; // integer vs float alpha comparison
+
+ bool operator==(const BLEND_COMPILE_STATE& other) const // bytewise equality over the whole struct
+ {
+ return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; // NOTE(review): also compares padding and _reserved bits; presumably callers zero-initialize the struct before filling it - confirm
+ }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
new file mode 100644
index 00000000000..c15bdf1e756
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -0,0 +1,71 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder.cpp
+*
+* @brief Includes all the builder related functionality
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "builder.h"
+
+using namespace llvm;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for Builder. Caches the JitManager's IR builder and
+///        creates the scalar, struct and SIMD types used during IR generation.
+/// @param pJitMgr - JitManager which contains modules, function passes, etc.
+Builder::Builder(JitManager *pJitMgr)
+    : mpJitMgr(pJitMgr)
+{
+    auto& ctx = pJitMgr->mContext;
+    const uint32_t simdWidth = pJitMgr->mVWidth;
+
+    mpIRBuilder = &pJitMgr->mBuilder;
+
+    // scalar types
+    mVoidTy   = Type::getVoidTy(ctx);
+    mFP16Ty   = Type::getHalfTy(ctx);
+    mFP32Ty   = Type::getFloatTy(ctx);
+    mDoubleTy = Type::getDoubleTy(ctx);
+    mInt1Ty   = Type::getInt1Ty(ctx);
+    mInt8Ty   = Type::getInt8Ty(ctx);
+    mInt16Ty  = Type::getInt16Ty(ctx);
+    mInt32Ty  = Type::getInt32Ty(ctx);
+    mInt64Ty  = Type::getInt64Ty(ctx);
+
+    // four component vectors, represented as structures
+    mV4FP32Ty  = StructType::get(ctx, std::vector<Type*>(4, mFP32Ty), false);
+    mV4Int32Ty = StructType::get(ctx, std::vector<Type*>(4, mInt32Ty), false);
+
+    // SIMD types, one element per lane
+    mSimdInt16Ty = VectorType::get(mInt16Ty, simdWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, simdWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, simdWidth);
+    mSimdFP16Ty  = VectorType::get(mFP16Ty, simdWidth);
+    mSimdFP32Ty  = VectorType::get(mFP32Ty, simdWidth);
+
+    // pointer-sized integer types follow the host pointer size
+    const bool pointersAre32Bit = (sizeof(uint32_t*) == 4);
+    if (!pointersAre32Bit)
+    {
+        SWR_ASSERT(sizeof(uint32_t*) == 8);
+    }
+    mIntPtrTy     = pointersAre32Bit ? mInt32Ty : mInt64Ty;
+    mSimdIntPtrTy = pointersAre32Bit ? mSimdInt32Ty : mSimdInt64Ty;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
new file mode 100644
index 00000000000..49216612cc9
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -0,0 +1,71 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder.h
+*
+* @brief Includes all the builder related functionality
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "JitManager.h"
+#include "common/formats.h"
+
+using namespace llvm;
+
+struct Builder // Base for the jitters (e.g. BlendJit): caches LLVM types and exposes the IR helper functions pulled in below
+{
+ Builder(JitManager *pJitMgr); // caches pJitMgr and its IR builder and creates the types below
+ IRBuilder<>* IRB() { return mpIRBuilder; };
+ JitManager* JM() { return mpJitMgr; }
+
+ JitManager* mpJitMgr; // not owned: stored from the constructor argument
+ IRBuilder<>* mpIRBuilder; // points at JitManager::mBuilder
+
+ // Built in types.
+ Type* mVoidTy;
+ Type* mInt1Ty;
+ Type* mInt8Ty;
+ Type* mInt16Ty;
+ Type* mInt32Ty;
+ Type* mInt64Ty;
+ Type* mIntPtrTy; // mInt32Ty or mInt64Ty, chosen from sizeof(uint32_t*) in the constructor
+ Type* mFP16Ty;
+ Type* mFP32Ty;
+ Type* mDoubleTy;
+ Type* mSimdFP16Ty; // the mSimd* types are vectors of JitManager::mVWidth elements
+ Type* mSimdFP32Ty;
+ Type* mSimdInt16Ty;
+ Type* mSimdInt32Ty;
+ Type* mSimdInt64Ty;
+ Type* mSimdIntPtrTy; // SIMD counterpart of mIntPtrTy
+ StructType* mV4FP32Ty; // struct of four floats
+ StructType* mV4Int32Ty; // struct of four int32
+ // the headers below are included inside the struct body, so what they declare becomes members of Builder
+#include "builder_gen.h"
+#include "builder_x86.h"
+#include "builder_misc.h"
+#include "builder_math.h"
+
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
new file mode 100644
index 00000000000..92867ec9836
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
@@ -0,0 +1,34 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_math.h
+*
+* @brief math/alu builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+// builder.h includes this header inside the body of struct Builder, so the
+// declarations below are Builder member functions.
+// NOTE(review): the names suggest SIMD log2, pow and exp2 helpers -- confirm
+// the exact semantics against their definitions.
+Value* VLOG2PS(Value* src);
+Value* VPOW24PS(Value* src);
+Value* VEXP2PS(Value* src);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
new file mode 100644
index 00000000000..5394fc7bf5a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -0,0 +1,1447 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.cpp
+*
+* @brief Implementation for miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#include "builder.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include <cstring>
+
+// printf-style output routine, declared here only. Builder::PRINT emits JIT
+// calls to it and registers its address with LLVM's dynamic symbol table
+// under the name "CallPrint".
+void __cdecl CallPrint(const char* fmt, ...);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 32-bit single precision float to a 16-bit
+///        float (1 sign bit, 5 exponent bits, 10 mantissa bits).
+/// @param val - 32-bit float
+/// @return 16-bit float encoding of val. NaN becomes 0xFE00, finite values
+///         too large for the target clamp to the largest finite magnitude,
+///         and values too small for a 16-bit denormal become signed zero.
+/// @todo Maybe move this outside of this file into a header?
+static uint16_t Convert32To16Float(float val)
+{
+    // Read the raw bits with memcpy: dereferencing the float through a
+    // uint32_t* violates the strict aliasing rules.
+    static_assert(sizeof(uint32_t) == sizeof(float), "float is expected to be 32 bits wide");
+    uint32_t uf;
+    std::memcpy(&uf, &val, sizeof(uf));
+
+    // Extract the sign, exponent, and mantissa
+    uint32_t sign = (uf & 0x80000000) >> 31;
+    uint32_t exp = (uf & 0x7F800000) >> 23;
+    uint32_t mant = uf & 0x007FFFFF;
+
+    // Check for out of range
+    if (std::isnan(val))
+    {
+        exp = 0x1F;
+        mant = 0x200;
+        sign = 1; // set the sign bit for NANs
+    }
+    else if (std::isinf(val))
+    {
+        exp = 0x1f;
+        mant = 0x0;
+    }
+    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
+    {
+        exp = 0x1E;
+        mant = 0x3FF;
+    }
+    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+    {
+        // Restore the implicit leading one, then shift right once for every
+        // exponent step up to 0x71 plus the 13 mantissa bits that do not fit.
+        const uint32_t shift = (0x71 - exp) + 13;
+        mant = (mant | 0x00800000) >> shift;
+        exp = 0;
+    }
+    else if (exp < 0x66) // Too small to represent -> Zero
+    {
+        exp = 0;
+        mant = 0;
+    }
+    else
+    {
+        // Saves bits that will be shifted off for rounding
+        const uint32_t roundBits = mant & 0x1FFFu;
+        // convert exponent and mantissa to 16 bit format
+        exp = exp - 0x70;
+        mant = mant >> 13;
+
+        // Essentially RTZ, but round up if off by only 1 lsb
+        if (roundBits == 0x1FFFu)
+        {
+            mant++;
+            // a carry out of the 10-bit mantissa bumps the exponent
+            if ((mant & 0xC00u) != 0)
+                exp++;
+            // make sure only the needed bits are used
+            mant &= 0x3FF;
+        }
+    }
+
+    const uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
+    return (uint16_t)tmpVal;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+///        float
+/// @param val - 16-bit float encoding; only the low 16 bits are examined
+/// @return the value as a 32-bit float (denormals are renormalised, NaN
+///         payloads collapse to a single quiet NaN with the input's sign)
+/// @todo Maybe move this outside of this file into a header?
+static float ConvertSmallFloatTo32(UINT val)
+{
+    uint32_t result;
+    if ((val & 0x7fff) == 0)
+    {
+        // signed zero: only the sign bit carries over
+        result = ((uint32_t)(val & 0x8000)) << 16;
+    }
+    else if ((val & 0x7c00) == 0x7c00)
+    {
+        // all-ones exponent: infinity when the mantissa is zero, NaN otherwise
+        result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
+        result |= ((uint32_t)val & 0x8000) << 16;
+    }
+    else
+    {
+        uint32_t sign = (val & 0x8000) << 16;
+        uint32_t mant = (val & 0x3ff) << 13;
+        uint32_t exp = (val >> 10) & 0x1f;
+        if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
+        {
+            // shift until the leading one reaches the implicit-bit position;
+            // exp wraps below zero on purpose and is re-biased modulo 2^32 below
+            mant <<= 1;
+            while (mant < (0x400 << 13))
+            {
+                exp--;
+                mant <<= 1;
+            }
+            mant &= (0x3ff << 13);
+        }
+        exp = ((exp - 15 + 127) & 0xff) << 23;
+        result = sign | exp | mant;
+    }
+
+    // Move the bits into a float with memcpy: reading the integer through a
+    // float* violates the strict aliasing rules.
+    static_assert(sizeof(float) == sizeof(result), "float is expected to be 32 bits wide");
+    float out;
+    std::memcpy(&out, &result, sizeof(out));
+    return out;
+}
+
+/// @brief Build an i1 constant from a bool.
+Constant *Builder::C(bool i)
+{
+    IntegerType* pInt1Ty = IRB()->getInt1Ty();
+    return ConstantInt::get(pInt1Ty, (i ? 1 : 0));
+}
+
+/// @brief Build an i8 constant from a char.
+Constant *Builder::C(char i)
+{
+    IntegerType* pInt8Ty = IRB()->getInt8Ty();
+    return ConstantInt::get(pInt8Ty, i);
+}
+
+/// @brief Build an i8 constant from an unsigned 8-bit value.
+Constant *Builder::C(uint8_t i)
+{
+    IntegerType* pInt8Ty = IRB()->getInt8Ty();
+    return ConstantInt::get(pInt8Ty, i);
+}
+
+/// @brief Build an i32 constant from an int.
+Constant *Builder::C(int i)
+{
+    IntegerType* pInt32Ty = IRB()->getInt32Ty();
+    return ConstantInt::get(pInt32Ty, i);
+}
+
+/// @brief Build an i64 constant from a 64-bit integer.
+Constant *Builder::C(int64_t i)
+{
+    IntegerType* pInt64Ty = IRB()->getInt64Ty();
+    return ConstantInt::get(pInt64Ty, i);
+}
+
+/// @brief Build a constant of the builder's cached 16-bit integer type.
+Constant *Builder::C(uint16_t i)
+{
+    Type* pInt16Ty = mInt16Ty;
+    return ConstantInt::get(pInt16Ty, i);
+}
+
+/// @brief Build an i32 constant from an unsigned 32-bit value.
+Constant *Builder::C(uint32_t i)
+{
+    IntegerType* pInt32Ty = IRB()->getInt32Ty();
+    return ConstantInt::get(pInt32Ty, i);
+}
+
+/// @brief Build a 32-bit floating point constant.
+Constant *Builder::C(float i)
+{
+    Type* pFloatTy = IRB()->getFloatTy();
+    return ConstantFP::get(pFloatTy, i);
+}
+
+/// @brief Build an i1 predicate constant (1 for true, 0 for false).
+Constant *Builder::PRED(bool pred)
+{
+    IntegerType* pInt1Ty = IRB()->getInt1Ty();
+    return ConstantInt::get(pInt1Ty, (pred ? 1 : 0));
+}
+
+/// @brief Splat an int constant across a vector of JM()->mVWidth lanes.
+Value *Builder::VIMMED1(int i)
+{
+    ConstantInt* pScalar = cast<ConstantInt>(C(i));
+    return ConstantVector::getSplat(JM()->mVWidth, pScalar);
+}
+
+/// @brief Splat an unsigned 32-bit constant across a vector of
+///        JM()->mVWidth lanes.
+Value *Builder::VIMMED1(uint32_t i)
+{
+    ConstantInt* pScalar = cast<ConstantInt>(C(i));
+    return ConstantVector::getSplat(JM()->mVWidth, pScalar);
+}
+
+/// @brief Splat a float constant across a vector of JM()->mVWidth lanes.
+Value *Builder::VIMMED1(float i)
+{
+    ConstantFP* pScalar = cast<ConstantFP>(C(i));
+    return ConstantVector::getSplat(JM()->mVWidth, pScalar);
+}
+
+/// @brief Splat an i1 constant across a vector of JM()->mVWidth lanes.
+Value *Builder::VIMMED1(bool i)
+{
+    ConstantInt* pScalar = cast<ConstantInt>(C(i));
+    return ConstantVector::getSplat(JM()->mVWidth, pScalar);
+}
+
+Value *Builder::VUNDEF_IPTR()
+{
+ return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+}
+
+Value *Builder::VUNDEF_I()
+{
+ return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+}
+
+Value *Builder::VUNDEF(Type *ty, uint32_t size)
+{
+ return UndefValue::get(VectorType::get(ty, size));
+}
+
+Value *Builder::VUNDEF_F()
+{
+ return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+}
+
+Value *Builder::VUNDEF(Type* t)
+{
+ return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+}
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+/// @brief LLVM 3.6 only: insert-element taking an integer index; the index
+///        is wrapped in an i64 constant and forwarded to the Value* overload.
+Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
+{
+    Constant* pIndex = C((int64_t)index);
+    return VINSERT(vec, val, pIndex);
+}
+#endif
+
+/// @brief Splat a scalar across JM()->mVWidth lanes; a value that already
+///        has vector type is returned untouched.
+Value *Builder::VBROADCAST(Value *src)
+{
+    if (!src->getType()->isVectorTy())
+    {
+        return VECTOR_SPLAT(JM()->mVWidth, src);
+    }
+
+    // already a vector, nothing to broadcast
+    return src;
+}
+
+/// @brief Read back the zero-extended value of an integer constant,
+///        narrowed to 32 bits.
+/// @param v - must be a ConstantInt (asserted)
+uint32_t Builder::IMMED(Value* v)
+{
+    SWR_ASSERT(isa<ConstantInt>(v));
+    const uint64_t zextValue = cast<ConstantInt>(v)->getZExtValue();
+    return static_cast<uint32_t>(zextValue);
+}
+
+/// @brief GEP taking its indices as a brace list of Values.
+Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+{
+    std::vector<Value*> gepIndices(indexList.begin(), indexList.end());
+    return GEPA(ptr, gepIndices);
+}
+
+/// @brief GEP taking its indices as a brace list of integers; each one is
+///        turned into an i32 constant.
+Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+{
+    std::vector<Value*> gepIndices;
+    gepIndices.reserve(indexList.size());
+    for (const uint32_t index : indexList)
+    {
+        gepIndices.push_back(C(index));
+    }
+    return GEPA(ptr, gepIndices);
+}
+
+/// @brief Load from the address formed by applying constant GEP indices to
+///        basePtr.
+/// @param basePtr - pointer the indices are applied to
+/// @param indices - integer GEP indices, emitted as i32 constants
+/// @param name - name for the resulting load instruction
+LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> gepIndices;
+    gepIndices.reserve(indices.size());
+    for (const uint32_t index : indices)
+    {
+        gepIndices.push_back(C(index));
+    }
+    Value* pAddress = GEPA(basePtr, gepIndices);
+    return LOAD(pAddress, name);
+}
+
+/// @brief Load from the address formed by applying Value GEP indices to
+///        basePtr.
+/// @param basePtr - pointer the indices are applied to
+/// @param indices - GEP indices
+/// @param name - name for the resulting load instruction
+LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> gepIndices(indices.begin(), indices.end());
+    Value* pAddress = GEPA(basePtr, gepIndices);
+    return LOAD(pAddress, name);
+}
+
+/// @brief Store val to the address formed by applying constant GEP indices
+///        to basePtr.
+/// @param val - value to store
+/// @param basePtr - pointer the indices are applied to
+/// @param indices - integer GEP indices, emitted as i32 constants
+StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+{
+    std::vector<Value*> gepIndices;
+    gepIndices.reserve(indices.size());
+    for (const uint32_t index : indices)
+    {
+        gepIndices.push_back(C(index));
+    }
+    Value* pAddress = GEPA(basePtr, gepIndices);
+    return STORE(val, pAddress);
+}
+
+/// @brief Store val to the address formed by applying Value GEP indices to
+///        basePtr.
+/// @param val - value to store
+/// @param basePtr - pointer the indices are applied to
+/// @param indices - GEP indices
+StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+{
+    std::vector<Value*> gepIndices(indices.begin(), indices.end());
+    Value* pAddress = GEPA(basePtr, gepIndices);
+    return STORE(val, pAddress);
+}
+
+/// @brief Emit a call whose arguments are given as a brace list.
+/// @param Callee - function to call
+/// @param argsList - call arguments, in order
+CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+{
+    std::vector<Value*> callArgs(argsList.begin(), argsList.end());
+    return CALLA(Callee, callArgs);
+}
+
+/// @brief Reciprocal computed as a full-precision divide: 1.0f / va.
+Value *Builder::VRCP(Value *va)
+{
+    Value* vOne = VIMMED1(1.0f);
+    return FDIV(vOne, va);
+}
+
+/// @brief Evaluate the plane equation vA * vX + vB * vY + vC with two
+///        chained FMADDPS operations.
+Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+{
+    // inner term first: vA * vX + vC, then accumulate vB * vY on top
+    return FMADDPS(vB, vY, FMADDPS(vA, vX, vC));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate an i32 masked load operation in LLVM IR. If the AVX2
+/// integer form is not available, emulate it with the AVX float masked
+/// load and bitcast the result back to integers.
+/// @param src - base address pointer for the load
+/// @param mask - SIMD wide mask selecting which lanes are loaded
+Value *Builder::MASKLOADD(Value* src,Value* mask)
+{
+    // use the avx2 integer masked load if it is available
+    if(JM()->mArch.AVX2())
+    {
+        Function *pMaskLoadD = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
+        return CALL(pMaskLoadD,{src,mask});
+    }
+
+    // otherwise go through the float variant, reinterpreting mask and result
+    Function *pMaskLoadPS = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
+    Value* vFloatMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
+    Value* vLoaded = CALL(pMaskLoadPS,{src,vFloatMask});
+    return BITCAST(vLoaded, VectorType::get(mInt32Ty,JM()->mVWidth));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief insert a JIT call to CallPrint
+/// - outputs formatted string to both stdout and VS output window
+/// - DEBUG builds only
+/// Usage example:
+/// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
+/// where C(lane) creates a constant value to print, and pIndex is the Value*
+/// result from a GEP, printing out the pointer to memory
+///
+/// Every '%' in the format string consumes one entry of printArgs (a literal
+/// "%%" is not recognised). Vector arguments are expanded into one specifier
+/// per element, "%t" picks %f or %d from the argument type, "%x"/"%X" are
+/// rewritten to "0x%08x"/"0x%08X", and floats are extended to double.
+/// @param printStr - constant string to print, which includes format specifiers
+/// @param printArgs - initializer list of Value*'s to print to std out
+/// @return the emitted call instruction
+CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+{
+    // push the arguments to CallPrint into a vector
+    std::vector<Value*> printCallArgs;
+    // save room for the format string. we still need to modify it for vectors
+    printCallArgs.resize(1);
+
+    // search through the format string for special processing
+    std::string tempStr(printStr);
+    size_t pos = tempStr.find('%');
+    auto v = printArgs.begin();
+
+    while ((pos != std::string::npos) && (v != printArgs.end()))
+    {
+        Value* pArg = *v;
+        Type* pType = pArg->getType();
+
+        if (tempStr[pos + 1] == 't')
+        {
+            if (pType->isVectorTy())
+            {
+                Type* pContainedType = pType->getContainedType(0);
+
+                // specifiers for elements 1..N-1, inserted in front of the
+                // original one (which ends up formatting the last pushed slot)
+                std::string vectorFormatStr;
+
+                if (pContainedType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure its %f
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%f ";
+                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
+                    }
+                }
+                else if (pContainedType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure its %d
+                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%d ";
+                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    }
+                }
+                else
+                {
+                    SWR_ASSERT(0, "Unsupported tyep");
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                if (pType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure its %f
+                    printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
+                }
+                else if (pType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure its %d
+                    printCallArgs.push_back(pArg);
+                }
+            }
+        }
+        else if (toupper(tempStr[pos + 1]) == 'X')
+        {
+            // rewrite "%x" in place to "0x%08x"
+            tempStr[pos] = '0';
+            tempStr.insert(pos + 1, "x%08");
+
+            if (pType->isVectorTy())
+            {
+                printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                std::string vectorFormatStr;
+                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                {
+                    vectorFormatStr += "0x%08X ";
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                printCallArgs.push_back(pArg);
+            }
+
+            // pos is on the '0' of the rewritten "0x%08x"; step past the '%'
+            // we just wrote so the search below cannot find it again. The
+            // vector path used to skip this and mis-parsed the next argument.
+            pos += 3;
+        }
+        // for %f we need to cast float Values to doubles so that they print out correctly
+        else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
+        {
+            printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
+            pos++;
+        }
+        // add special handling for %f and %d format specifiers to make printing llvm vector types easier
+        else if (pType->isVectorTy())
+        {
+            Type* pContainedType = pType->getContainedType(0);
+
+            if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%f "));
+                    pos += 3;
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                }
+                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+            }
+            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%d "));
+                    pos += 3;
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+            }
+            else
+            {
+                /// not a supported vector to print
+                /// @todo pointer types too
+                SWR_ASSERT(0);
+            }
+        }
+        else
+        {
+            printCallArgs.push_back(pArg);
+        }
+
+        // advance to the next argument
+        v++;
+        pos = tempStr.find('%', ++pos);
+    }
+
+    // create global variable constant string
+    Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
+    GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
+    JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
+
+    // get a pointer to the first character in the constant string array
+    std::vector<Constant*> geplist{C(0),C(0)};
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+    Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
+#else
+    Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
+#endif
+
+    // insert the pointer to the format string in the argument vector
+    printCallArgs[0] = strGEP;
+
+    // get pointer to CallPrint function and insert decl into the module if needed
+    std::vector<Type*> args;
+    args.push_back(PointerType::get(mInt8Ty,0));
+    FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
+    Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
+
+    // if we haven't yet added the symbol to the symbol table
+    if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
+    }
+
+    // insert a call to CallPrint
+    return CALLA(callPrintFn,printCallArgs);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convenience overload of PRINT for a format string that takes no
+/// arguments; forwards to the initializer-list version with an empty list.
+CallInst* Builder::PRINT(const std::string &printStr)
+{
+    const std::initializer_list<Value*> noArgs = {};
+    return PRINT(printStr, noArgs);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a masked float gather in LLVM IR. Uses the AVX2 gather
+/// when available, otherwise emits one scalar load per lane.
+/// @param vSrc - SIMD wide value supplying the result for masked-off lanes
+/// @param pBase - Int8* base VB address pointer value
+/// @param vIndices - SIMD wide value of VB byte offsets
+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+/// @param scale - value to scale indices by
+Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+{
+    // native path: hand everything to the avx2 gather instruction
+    if(JM()->mArch.AVX2())
+    {
+        // force mask to <N x float>, required by vgather
+        Value* vFloatMask = BITCAST(vMask, mSimdFP32Ty);
+        return VGATHERPS(vSrc,pBase,vIndices,vFloatMask,scale);
+    }
+
+    // emulation path
+    Value* pSavedStack = STACKSAVE();
+
+    // spill vSrc so each lane can pick between the real load address and the
+    // address of its own fallback value
+    Value* pSrcSlot = ALLOCA(vSrc->getType());
+    STORE(vSrc, pSrcSlot);
+
+    Value* vResult = VUNDEF_F();
+    Value *vScale = VBROADCAST(Z_EXT(scale,mInt32Ty));
+    Value *vByteOffsets = MUL(vIndices,vScale);
+    Value *vLaneMask = MASK(vMask);
+    for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+    {
+        // byte offset of this lane, applied to the base pointer
+        Value *laneOffset = VEXTRACT(vByteOffsets,C(lane));
+        Value *pLoad = GEP(pBase,laneOffset);
+        pLoad = BITCAST(pLoad,PointerType::get(mFP32Ty,0));
+        // address of the fallback value for this lane
+        Value *pFallback = GEP(pSrcSlot,{C(0), C(lane)});
+        Value *laneEnabled = VEXTRACT(vLaneMask,C(lane));
+        // masked-off lanes read their fallback instead of touching memory at pBase
+        Value *pChosen = SELECT(laneEnabled, pLoad, pFallback);
+        Value *laneVal = LOAD(pChosen);
+        vResult = VINSERT(vResult,laneVal,C(lane));
+    }
+    STACKRESTORE(pSavedStack);
+
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a masked 32-bit integer gather in LLVM IR. If not
+/// supported on the underlying platform, emulate it with loads
+/// @param vSrc - SIMD wide value supplying the result for masked-off lanes
+/// @param pBase - Int8* base VB address pointer value
+/// @param vIndices - SIMD wide value of VB byte offsets
+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+/// @param scale - value to scale indices by
+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+{
+    Value* vGather;
+
+    // use avx2 gather instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+    }
+    else
+    {
+        Value* pStack = STACKSAVE();
+
+        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+        Value* vSrcPtr = ALLOCA(vSrc->getType());
+        STORE(vSrc, vSrcPtr);
+
+        vGather = VUNDEF_I();
+        Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+        Value *vOffsets = MUL(vIndices, vScaleVec);
+        Value *mask = MASK(vMask);
+        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        {
+            // single component byte index
+            Value *offset = VEXTRACT(vOffsets, C(i));
+            // byte pointer to component
+            Value *loadAddress = GEP(pBase, offset);
+            loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
+            // pointer to the value to load if we're masking off a component
+            Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
+            Value *selMask = VEXTRACT(mask, C(i));
+            // switch in a safe address to load if this lane is masked off
+            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+            // Plain load, as in GATHERPS. The former LOAD(validAddress, C(0))
+            // handed a non-null Constant* to what appears to be the
+            // bool isVolatile parameter, making every load volatile.
+            Value *val = LOAD(validAddress);
+            vGather = VINSERT(vGather, val, C(i));
+        }
+
+        STACKRESTORE(pStack);
+    }
+    return vGather;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief convert an x86 style SIMD mask to an llvm <N x i1> mask; a lane
+/// is set when its value, viewed as a signed 32-bit integer, is negative
+Value* Builder::MASK(Value* vmask)
+{
+    Value* vAsInt = BITCAST(vmask, mSimdInt32Ty);
+    Value* vZero = VIMMED1(0);
+    return ICMP_SLT(vAsInt, vZero);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask by sign
+/// extending every lane
+Value* Builder::VMASK(Value* mask)
+{
+    Value* vWideMask = S_EXT(mask, mSimdInt32Ty);
+    return vWideMask;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPSHUFB operation in LLVM IR. If not
+/// supported on the underlying platform, emulate it
+/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
+/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
+/// Byte masks in lower 128 lane of b selects 8 bit values from lower
+/// 128bits of a, and vice versa for the upper lanes. If the mask
+/// value is negative, '0' is inserted.
+/// @note The emulation path reads the mask at JIT-build time, so without
+/// AVX2 b must be a constant vector.
+Value *Builder::PSHUFB(Value* a, Value* b)
+{
+    Value* res;
+    // use avx2 pshufb instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPSHUFB(a, b);
+    }
+    else
+    {
+        Constant* cB = dyn_cast<Constant>(b);
+        // dyn_cast yields null for a non-constant mask; fail loudly here
+        // instead of dereferencing a null pointer below
+        SWR_ASSERT(cB != nullptr, "PSHUFB emulation requires a constant shuffle mask");
+        // number of 8 bit elements in b
+        uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
+        // output vector
+        Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+
+        // insert an 8 bit value from the high and low lanes of a per loop iteration
+        numElms /= 2;
+        for(uint32_t i = 0; i < numElms; i++)
+        {
+            ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+            ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
+
+            // extract values from constant mask
+            char valLow128bLane = (char)(cLow128b->getSExtValue());
+            char valHigh128bLane = (char)(cHigh128b->getSExtValue());
+
+            Value* insertValLow128b;
+            Value* insertValHigh128b;
+
+            // if the mask value is negative, insert a '0' in the respective output position
+            // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
+            insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+            insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+
+            vShuf = VINSERT(vShuf, insertValLow128b, i);
+            vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
+        }
+        res = vShuf;
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
+/// lower 8 values are used.
+Value *Builder::PMOVSXBD(Value* a)
+{
+    // use avx2 byte sign extend instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        return VPMOVSXBD(a);
+    }
+
+    // emulation: keep the low 8 elements, then sign extend them to the
+    // VPMOVSXBD output type
+    Value* vLow8 = VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7}));
+    Type* pResultTy = VectorType::get(mInt32Ty, 8);
+    return S_EXT(vLow8, pResultTy);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
+Value *Builder::PMOVSXWD(Value* a)
+{
+    // use avx2 word sign extend if available
+    if(JM()->mArch.AVX2())
+    {
+        return VPMOVSXWD(a);
+    }
+
+    // emulation: select elements 0..7, then sign extend them to the
+    // VPMOVSXWD output type
+    Value* vLow8 = VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7}));
+    Type* pResultTy = VectorType::get(mInt32Ty, 8);
+    return S_EXT(vLow8, pResultTy);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMD operation (shuffle 32 bit integer values
+/// across 128 bit lanes) in LLVM IR. If not supported on the underlying
+/// platform, emulate it with a generic vector shuffle
+/// @param a - 256bit SIMD lane(8x32bit) of integer values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMD(Value* a, Value* idx)
+{
+    if(!JM()->mArch.AVX2())
+    {
+        // no native permute: shuffle a with itself using idx as the mask
+        return VSHUFFLE(a, a, idx);
+    }
+
+    // llvm 3.6.0 swapped the order of the args to vpermd
+    return VPERMD(idx, a);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
+/// in LLVM IR. If not supported on the underlying platform, emulate it
+/// with one call per lane to the scalar ConvertSmallFloatTo32 helper.
+/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
+Value *Builder::CVTPH2PS(Value* a)
+{
+    if (JM()->mArch.F16C())
+    {
+        return VCVTPH2PS(a);
+    }
+    else
+    {
+        // Spell out the parameter list. FunctionType::get(ret, Type*) picks the
+        // (Type*, bool isVarArg) overload via pointer-to-bool conversion and
+        // declared the helper as a varargs function with no fixed parameters.
+        // The helper only examines the low 16 bits of its argument.
+        std::vector<Type*> cvtParams{mInt16Ty};
+        FunctionType* pFuncTy = FunctionType::get(mFP32Ty, cvtParams, false);
+        Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
+
+        // make the helper resolvable by the JIT if it is not registered yet
+        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+        {
+            sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+        }
+
+        Value* pResult = UndefValue::get(mSimdFP32Ty);
+        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        {
+            Value* pSrc = VEXTRACT(a, C(i));
+            Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
+            pResult = VINSERT(pResult, pConv, C(i));
+        }
+
+        return pResult;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
+/// in LLVM IR. If not supported on the underlying platform, emulate it
+/// with one call per lane to the scalar Convert32To16Float helper.
+/// @param a - SIMD wide value of 32bit floats to convert
+/// @param rounding - rounding control forwarded to VCVTPS2PH; the emulation
+/// path does not use it
+Value *Builder::CVTPS2PH(Value* a, Value* rounding)
+{
+    if (JM()->mArch.F16C())
+    {
+        return VCVTPS2PH(a, rounding);
+    }
+    else
+    {
+        // call scalar C function for now
+        // Spell out the parameter list. FunctionType::get(ret, Type*) picks the
+        // (Type*, bool isVarArg) overload via pointer-to-bool conversion and
+        // declared the helper as a varargs function with no fixed parameters.
+        std::vector<Type*> cvtParams{mFP32Ty};
+        FunctionType* pFuncTy = FunctionType::get(mInt16Ty, cvtParams, false);
+        Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
+
+        // make the helper resolvable by the JIT if it is not registered yet
+        if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
+        {
+            sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
+        }
+
+        Value* pResult = UndefValue::get(mSimdInt16Ty);
+        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        {
+            Value* pSrc = VEXTRACT(a, C(i));
+            Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
+            pResult = VINSERT(pResult, pConv, C(i));
+        }
+
+        return pResult;
+    }
+}
+
+/// @brief Signed 32-bit integer max. Uses VPMAXSD when AVX2 is available,
+/// otherwise applies the 4-wide SSE4.1 intrinsic to each 128-bit half.
+Value *Builder::PMAXSD(Value* a, Value* b)
+{
+    if (JM()->mArch.AVX2())
+    {
+        return VPMAXSD(a, b);
+    }
+
+    // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
+    Function* pMaxFn = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
+
+    // half 0 is the low 128 bits, half 1 the high 128 bits
+    Value* halfResults[2];
+    for (uint8_t half = 0; half < 2; ++half)
+    {
+        Value* aHalf = VEXTRACTI128(a, C(half));
+        Value* bHalf = VEXTRACTI128(b, C(half));
+        halfResults[half] = CALL(pMaxFn, {aHalf, bHalf});
+    }
+
+    // combine
+    Value* result = VUNDEF_I();
+    for (uint8_t half = 0; half < 2; ++half)
+    {
+        result = VINSERTI128(result, halfResults[half], C(half));
+    }
+
+    return result;
+}
+
+/// @brief Signed 32-bit integer min. Uses VPMINSD when AVX2 is available,
+/// otherwise applies the 4-wide SSE4.1 intrinsic to each 128-bit half.
+Value *Builder::PMINSD(Value* a, Value* b)
+{
+    if (JM()->mArch.AVX2())
+    {
+        return VPMINSD(a, b);
+    }
+
+    // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
+    Function* pMinFn = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
+
+    // half 0 is the low 128 bits, half 1 the high 128 bits
+    Value* halfResults[2];
+    for (uint8_t half = 0; half < 2; ++half)
+    {
+        Value* aHalf = VEXTRACTI128(a, C(half));
+        Value* bHalf = VEXTRACTI128(b, C(half));
+        halfResults[half] = CALL(pMinFn, {aHalf, bHalf});
+    }
+
+    // combine
+    Value* result = VUNDEF_I();
+    for (uint8_t half = 0; half < 2; ++half)
+    {
+        result = VINSERTI128(result, halfResults[half], C(half));
+    }
+
+    return result;
+}
+
+/// @brief Gather up to four components of the given format, dispatching to
+/// the float path for 32-bit float formats and to the integer path for
+/// everything else.
+/// @param format - surface format of the data being fetched
+/// @param pSrcBase - base pointer of the source data
+/// @param byteOffsets - SIMD wide byte offsets from pSrcBase
+/// @param mask - SIMD wide mask; bitcast to the type the chosen path expects
+/// @param vGatherComponents - receives the gathered components
+/// @param bPackedOutput - forwarded to the chosen gather path
+void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
+                      Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+    const bool isFloat32 = (info.type[0] == SWR_TYPE_FLOAT) && (info.bpc[0] == 32);
+
+    if (!isFloat32)
+    {
+        // integer gather wants an integer mask
+        mask = BITCAST(mask, mSimdInt32Ty);
+        GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+        return;
+    }
+
+    // float gather wants a float mask
+    mask = BITCAST(mask, mSimdFP32Ty);
+    GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+}
+
+/// @brief Float gather of up to four components, selected by the format's
+/// bits per component (info.bpp / info.numComps): 16 or 32. Any other width
+/// asserts.
+/// @param info - format description (component count, swizzle, defaults)
+/// @param pSrcBase - base pointer of the source data
+/// @param byteOffsets - SIMD wide byte offsets from pSrcBase
+/// @param mask - SIMD wide mask passed to every gather
+/// @param vGatherComponents - receives the gathered components
+/// @param bPackedOutput - forwarded to the 16-bit shuffle helper
+void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+                        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+{
+    const auto bitsPerComp = info.bpp / info.numComps;
+
+    if (bitsPerComp == 16)
+    {
+        // value produced for lanes that are masked off
+        // TODO: vGatherMaskedVal
+        Value* vMaskedVal = VIMMED1((float)0);
+        Value* vGathered[2];
+
+        // the first gather is always issued: each 32-bit element holds x and y
+        // 256i - 0    1    2    3    4    5    6    7
+        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+        vGathered[0] = GATHERPS(vMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+
+        // a second gather is only needed when z and/or w are present
+        if(info.numComps > 2)
+        {
+            // offset base to the next components(zw) in the vertex to gather
+            pSrcBase = GEP(pSrcBase, C((char)4));
+
+            // 256i - 0    1    2    3    4    5    6    7
+            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+            vGathered[1] = GATHERPS(vMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+        }
+        else
+        {
+            vGathered[1] = vMaskedVal;
+        }
+
+        // Shuffle gathered components into place, each row is a component
+        Shuffle16bpcGather4(info, vGathered, vGatherComponents, bPackedOutput);
+    }
+    else if (bitsPerComp == 32)
+    {
+        // start every output component from the format's default value
+        for (uint32_t comp = 0; comp < 4; ++comp)
+        {
+            vGatherComponents[comp] = VIMMED1(*(float*)&info.defaults[comp]);
+        }
+
+        for(uint32_t comp = 0; comp < info.numComps; comp++)
+        {
+            // destination slot for this source component
+            Value*& vDest = vGatherComponents[info.swizzle[comp]];
+
+            // Gather a SIMD of components; masked-off lanes keep the default
+            vDest = GATHERPS(vDest, pSrcBase, byteOffsets, mask, C((char)1));
+
+            // offset base to the next component to gather
+            pSrcBase = GEP(pSrcBase, C((char)4));
+        }
+    }
+    else
+    {
+        SWR_ASSERT(0, "Invalid float format");
+    }
+}
+/// @brief Gathers up to four integer components per SIMD lane with dword gathers (GATHERDD), dispatching on the per-component bit width (info.bpp / info.numComps), and writes one vector per component to vGatherComponents[].
+void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+{
+ switch (info.bpp / info.numComps)
+ {
+ case 8:
+ {
+ Value* vGatherMaskedVal = VIMMED1((int32_t)0);
+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+ // e.g. result of an 8x32bit integer gather for 8bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+ // split the gathered dwords into one output vector per component
+ Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+ }
+ break;
+ case 16:
+ {
+ Value* vGatherResult[2];
+ Value *vMask;
+ // vGatherResult[0] receives the xy pairs, vGatherResult[1] the zw pairs (or the zero vector when there are no z/w components)
+ // TODO: vGatherMaskedVal (for now a zero vector is passed as the gather's src operand)
+ Value* vGatherMaskedVal = VIMMED1((int32_t)0);
+
+ // always have at least one component out of x or y to fetch
+
+ // save mask as it is zero'd out after each gather
+ vMask = mask;
+
+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ // e.g. result of first 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+ //
+
+ // if we have at least one component out of z or w to fetch (more than two components)
+ if(info.numComps > 2)
+ {
+ // offset base to the next components(zw) in the vertex to gather
+ pSrcBase = GEP(pSrcBase, C((char)4));
+ vMask = mask;
+
+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ // e.g. result of second 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+ //
+ }
+ else
+ {
+ vGatherResult[1] = vGatherMaskedVal; // no z/w data: hand the zero vector to the shuffle
+ }
+
+ // Shuffle gathered components into place, each row is a component
+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+
+ }
+ break;
+ case 32:
+ {
+ // apply defaults to all four outputs; components present in the format are overwritten below
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
+ }
+
+ for(uint32_t i = 0; i < info.numComps; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i]; // output slot for the i-th component in memory
+
+ // save mask as it is zero'd out after each gather
+ Value *vMask = mask;
+
+ // Gather a SIMD of components (the current default vector is passed as the gather's src operand)
+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+
+ // offset base to the next component to gather (4 bytes per 32bit component)
+ pSrcBase = GEP(pSrcBase, C((char)4));
+ }
+ }
+ break;
+ default:
+ SWR_ASSERT(0, "unsupported format");
+ break;
+ }
+}
+/// @brief Rearranges two dword gathers of 16bit components (xy pairs in vGatherInput[0], zw pairs in vGatherInput[1]) into one vector per component in vGatherOutput[]; bPackedOutput selects packed 128bit lanes instead of one zero-extended word per 32bit lane.
+void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
+{
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+
+ // input could either be float or int vector; do shuffle work in int (note: this overwrites the caller's vGatherInput entries)
+ vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
+ vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
+
+ if(bPackedOutput)
+ {
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask
+ Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+ Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+ // after PERMD: move and pack xy components into each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if(info.numComps > 2)
+ {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); // shadows the outer vShufResult
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+ }
+
+ for(uint32_t i = 0; i < 4; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+ // todo: fix for packed
+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+ if(i >= info.numComps)
+ {
+ // set the default component val
+ vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+ continue;
+ }
+
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW (vi128ZW is only non-null when numComps > 2, which holds whenever i >= 2 reaches this point)
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ // extract packed component 128 bit lanes
+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+ }
+
+ }
+ else
+ {
+ // pshufb masks for each component
+ Value* vConstMask[2];
+ // x/z shuffle mask
+ vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+
+ // y/w shuffle mask
+ vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+
+
+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
+ }
+
+ for(uint32_t i = 0; i < info.numComps; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // select correct constMask for x/z or y/w pshufb
+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use the first (xy) gather result, else use the second (zw) gather result
+ uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ // after pshufb mask for x channel; z uses the same shuffle from the second gather
+ // 256i - 0 1 2 3 4 5 6 7
+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+ }
+ }
+}
+/// @brief Rearranges one dword gather of 8bit components (xyzw per 32bit lane) into one vector per component in vGatherOutput[]; bPackedOutput selects packed 128bit lanes instead of one zero-extended byte per 32bit lane.
+void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
+{
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+
+ if(bPackedOutput)
+ {
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ // shuffle mask
+ Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+ Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if(info.numComps > 2)
+ {
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
+ }
+
+ // for each component: write the default if the format lacks it, otherwise extract its packed 128bit lane (no sign extension is emitted here)
+ for(uint32_t i = 0; i < 4; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+ // todo: fix for packed
+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+ if(i >= info.numComps)
+ {
+ // set the default component val
+ vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+ continue;
+ }
+
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ // extract packed component 128 bit lanes
+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+ }
+ }
+ // else zero extend
+ else{
+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
+ }
+
+ for(uint32_t i = 0; i < info.numComps; i++){
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // pshufb masks for each component
+ Value* vConstMask;
+ switch(i)
+ {
+ case 0:
+ // x shuffle mask
+ vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
+ break;
+ case 1:
+ // y shuffle mask
+ vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
+ break;
+ case 2:
+ // z shuffle mask
+ vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
+ break;
+ case 3:
+ // w shuffle mask
+ vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
+ break;
+ default:
+ vConstMask = nullptr; // NOTE(review): only reachable if info.numComps > 4; the PSHUFB below would then receive a null mask
+ break;
+ }
+
+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb for x channel
+ // 256i - 0 1 2 3 4 5 6 7
+ // x000 x000 x000 x000 x000 x000 x000 x000
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Emulates a masked scatter with one scalar store per SIMD lane.
+///        Lanes that are masked off store into a scratch stack slot
+///        instead of memory reachable from pDst.
+/// @param pDst - base pointer of the destination
+/// @param vSrc - vector holding the values to scatter
+/// @param vOffsets - vector of per-lane byte offsets relative to pDst
+/// @param vMask - mask selecting the lanes that are really written
+void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+{
+    // bracket the scratch allocation so the stack is released again on exit
+    Value* pSavedStack = STACKSAVE();
+
+    // scratch slot that absorbs the stores of masked off lanes
+    Value* pScratch = ALLOCA(vSrc->getType()->getVectorElementType());
+
+    Value* laneMask = MASK(vMask);
+    const uint32_t laneCount = JM()->mVWidth;
+    for (uint32_t lane = 0; lane != laneCount; ++lane)
+    {
+        Constant* laneIdx = C(lane);
+
+        // destination of this lane: pDst + byte offset, viewed as a float pointer
+        Value* laneOffset = VEXTRACT(vOffsets, laneIdx);
+        Value* pLaneDst = GEP(pDst, laneOffset);
+        pLaneDst = BITCAST(pLaneDst, PointerType::get(mFP32Ty, 0));
+
+        Value* laneEnabled = VEXTRACT(laneMask, laneIdx);
+        Value* laneValue = VEXTRACT(vSrc, laneIdx);
+
+        // redirect the store to the scratch slot when the lane is disabled
+        Value* pStoreTarget = SELECT(laneEnabled, pLaneDst, pScratch);
+        STORE(laneValue, pStoreTarget);
+    }
+
+    STACKRESTORE(pSavedStack);
+}
+
+/// @brief Emits a per-lane float absolute value by masking off the sign bit.
+/// @param a - SIMD float vector
+/// @return the result reinterpreted as mSimdFP32Ty
+Value* Builder::VABSPS(Value* a)
+{
+    // work on the raw bits: 0x7fffffff keeps everything except bit 31
+    Value* rawBits = BITCAST(a, mSimdInt32Ty);
+    Value* signCleared = AND(rawBits, VIMMED1(0x7fffffff));
+    return BITCAST(signCleared, mSimdFP32Ty);
+}
+
+/// @brief Emits a signed integer clamp of src to [low, high].
+/// @param src - value to clamp
+/// @param low - lower bound, applied first
+/// @param high - upper bound, applied to the result of the lower clamp
+/// @return the clamped value
+Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
+{
+    // raise everything that compares (signed) below 'low'
+    Value* isBelowLow = ICMP_SLT(src, low);
+    Value* raised = SELECT(isBelowLow, low, src);
+
+    // then cap everything that compares (signed) above 'high'
+    Value* isAboveHigh = ICMP_SGT(raised, high);
+    return SELECT(isAboveHigh, high, raised);
+}
+
+/// @brief Emits a float clamp of src to [low, high] using FCMP_OLT / FCMP_OGT
+///        compares followed by selects.
+/// @param src - value to clamp
+/// @param low - lower bound, applied first
+/// @param high - upper bound, applied to the result of the lower clamp
+/// @return the clamped value
+Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
+{
+    // raise everything below 'low'
+    Value* isBelowLow = FCMP_OLT(src, low);
+    Value* raised = SELECT(isBelowLow, low, src);
+
+    // then cap everything above 'high'
+    Value* isAboveHigh = FCMP_OGT(raised, high);
+    return SELECT(isAboveHigh, high, raised);
+}
+
+/// @brief Emits a float clamp of src against two immediate bounds, using
+///        VMAXPS for the lower bound and VMINPS for the upper bound.
+/// @param src - SIMD float vector to clamp
+/// @param low - lower bound, expanded with VIMMED1
+/// @param high - upper bound, expanded with VIMMED1
+/// @return the clamped vector
+Value *Builder::FCLAMP(Value* src, float low, float high)
+{
+    Value* atLeastLow = VMAXPS(src, VIMMED1(low));
+    return VMINPS(atLeastLow, VIMMED1(high));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Emits a call to the llvm stacksave intrinsic. Paired with
+///        STACKRESTORE this lets generated code push/pop the stack and so
+///        reduces overall stack requirements for temporary stack use.
+/// @return the value produced by the intrinsic call, to be handed to STACKRESTORE
+Value* Builder::STACKSAVE()
+{
+    Function* stackSaveFn = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
+
+    // the call helper used for an argument-less call depends on the LLVM version
+#if !(LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6)
+    Value* savedStack = CALLA(stackSaveFn);
+#else
+    Value* savedStack = CALL(stackSaveFn);
+#endif
+    return savedStack;
+}
+
+/// @brief Emits a call to the llvm stackrestore intrinsic.
+/// @param pSaved - value previously returned by STACKSAVE
+void Builder::STACKRESTORE(Value* pSaved)
+{
+    Function* stackRestoreFn = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
+
+    // the intrinsic takes the saved stack value as its only argument
+    const std::initializer_list<Value*> callArgs{pSaved};
+    CALL(stackRestoreFn, callArgs);
+}
+
+/// @brief Emits a * b + c, as VFMADDPS when the target arch reports AVX2 and
+///        as a separate FMUL followed by FADD otherwise.
+/// @param a - first multiplicand
+/// @param b - second multiplicand
+/// @param c - addend
+/// @return the value holding a * b + c
+Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
+{
+    // no FMADs available: multiply, then add
+    if(!JM()->mArch.AVX2())
+    {
+        Value* product = FMUL(a, b);
+        return FADD(product, c);
+    }
+
+    return VFMADDPS(a, b, c);
+}
+
+/// @brief Emits a call to the llvm ctpop intrinsic, overloaded on the type of a.
+/// @param a - operand passed to the intrinsic
+/// @return the call instruction's result
+Value* Builder::POPCNT(Value* a)
+{
+    // ctpop is an overloaded intrinsic; pick the declaration matching the operand type
+    Type* operandTy = a->getType();
+    Function* ctpopFn = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { operandTy });
+
+    const std::initializer_list<Value*> callArgs{a};
+    return CALL(ctpopFn, callArgs);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief C functions called by LLVM IR
+//////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief called in JIT code, inserted by PRINT
+///        output to both stdout and visual studio debug console
+/// @param fmt - printf style format string
+/// @param ... - arguments consumed by fmt
+void __cdecl CallPrint(const char* fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    vprintf(fmt, args);
+    // every va_start needs a matching va_end before the function returns
+    va_end(args);
+
+#if defined( _WIN32 )
+    // vprintf leaves 'args' indeterminate, so the argument list must be
+    // traversed from the start again rather than reusing the consumed list
+    va_list dbgArgs;
+    va_start(dbgArgs, fmt);
+    char strBuf[1024];
+    vsnprintf_s(strBuf, _TRUNCATE, fmt, dbgArgs);
+    va_end(dbgArgs);
+    OutputDebugString(strBuf);
+#endif
+}
+
+/// @brief Extracts one half of the lanes of a. On LLVM 3.6 the x86
+///        vextractf128 intrinsic is called; otherwise the extraction is
+///        expressed as a shuffle.
+/// @param a - source vector
+/// @param imm8 - zero selects the lower half, any other constant the upper half
+/// @return a vector holding the selected half
+Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
+{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+    Function *extractFn =
+        Intrinsic::getDeclaration(JM()->mpCurrentModule,
+                                  Intrinsic::x86_avx_vextractf128_si_256);
+    return CALL(extractFn, {a, imm8});
+#else
+    const bool upperHalf = !imm8->isZeroValue();
+    const unsigned halfWidth = JM()->mVWidth / 2;
+    const unsigned firstLane = upperHalf ? halfWidth : 0;
+
+    // consecutive lane indices starting at the first lane of the requested half
+    SmallVector<Constant*,8> laneIndices;
+    for (unsigned lane = 0; lane != halfWidth; ++lane)
+    {
+        laneIndices.push_back(C(firstLane + lane));
+    }
+
+    Value* undefVec = VUNDEF_I();
+    return VSHUFFLE(a, undefVec, ConstantVector::get(laneIndices));
+#endif
+}
+
+/// @brief Replaces one half of the lanes of a with b. On LLVM 3.6 the x86
+///        vinsertf128 intrinsic is called; otherwise the insertion is
+///        expressed as two shuffles.
+/// @param a - full width vector receiving the insertion
+/// @param b - half width vector to insert
+/// @param imm8 - zero replaces the lower half, any other constant the upper half
+/// @return the combined vector
+Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
+{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+    Function *insertFn =
+        Intrinsic::getDeclaration(JM()->mpCurrentModule,
+                                  Intrinsic::x86_avx_vinsertf128_si_256);
+    return CALL(insertFn, {a, b, imm8});
+#else
+    const bool intoUpperHalf = !imm8->isZeroValue();
+    const unsigned fullWidth = JM()->mVWidth;
+    const unsigned halfWidth = fullWidth / 2;
+
+    // step 1: widen b to the full lane count with an identity shuffle
+    SmallVector<Constant*,8> widenIndices;
+    for (unsigned lane = 0; lane != fullWidth; ++lane)
+    {
+        widenIndices.push_back(C(lane));
+    }
+    Value *widenedB = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(widenIndices));
+
+    // step 2: blend; indices below fullWidth pick lanes of a, indices from
+    // fullWidth upwards pick lanes of widenedB
+    SmallVector<Constant*,8> blendIndices;
+    for (unsigned lane = 0; lane != fullWidth; ++lane)
+    {
+        unsigned source;
+        if (lane < halfWidth)
+        {
+            // lower half of the result: keep a, or take lane 'lane' of b
+            source = intoUpperHalf ? lane : lane + fullWidth;
+        }
+        else
+        {
+            // upper half of the result: take lane 'lane - halfWidth' of b, or keep a
+            source = intoUpperHalf ? lane + halfWidth : lane;
+        }
+        blendIndices.push_back(C(source));
+    }
+    return VSHUFFLE(a, widenedB, ConstantVector::get(blendIndices));
+#endif
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
new file mode 100644
index 00000000000..48e0558c4dd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -0,0 +1,149 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.h
+*
+* @brief miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+// Scalar-to-Constant overloads, one per supported scalar type; only declared here (defined out of line).
+Constant *C(bool i);
+Constant *C(char i);
+Constant *C(uint8_t i);
+Constant *C(int i);
+Constant *C(int64_t i);
+Constant *C(uint16_t i);
+Constant *C(uint32_t i);
+Constant *C(float i);
+
+/// @brief Builds a constant vector from a braced list of scalars.
+/// @tparam Ty - element type; selects the scalar C() overload used per element
+/// @param constList - values for the vector lanes, in lane order
+/// @return ConstantVector holding one constant per list entry
+template<typename Ty>
+Constant *C(const std::initializer_list<Ty> &constList)
+{
+    std::vector<Constant*> laneConstants;
+    for (auto it = constList.begin(); it != constList.end(); ++it)
+    {
+        // convert each entry through the scalar overload for Ty
+        laneConstants.push_back(C(static_cast<Ty>(*it)));
+    }
+    return ConstantVector::get(laneConstants);
+}
+// Immediate / undef / broadcast helpers; only declared here. Callers use VIMMED1 results as whole SIMD vectors (e.g. as gather sources and per-component defaults).
+Constant *PRED(bool pred);
+Value *VIMMED1(int i);
+Value *VIMMED1(uint32_t i);
+Value *VIMMED1(float i);
+Value *VIMMED1(bool i);
+Value *VUNDEF(Type* t);
+Value *VUNDEF_F();
+Value *VUNDEF_I();
+Value *VUNDEF(Type* ty, uint32_t size);
+Value *VUNDEF_IPTR();
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 // this VINSERT overload is only declared for LLVM 3.6
+Value *VINSERT(Value *vec, Value *val, uint64_t index);
+#endif
+Value *VBROADCAST(Value *src);
+Value *VRCP(Value *va);
+Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
+// NOTE(review): IMMED presumably reads the integer back out of a constant Value - confirm against its definition
+uint32_t IMMED(Value* i);
+// initializer-list convenience overloads: index lists for GEP, argument lists for CALL
+Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
+Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
+// LOAD/STORE take a list of uint32_t indices, LOADV/STOREV take a list of Value* indices (both lists are named 'offset')
+LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
+LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
+StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
+StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
+
+// Convenience wrappers around VCMPPS, one per comparison predicate. Each
+// passes the predicate as a uint8_t constant in the third operand.
+
+/// @brief VCMPPS with the _CMP_EQ_OQ predicate
+Value *VCMPPS_EQ(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_EQ_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_LT_OQ predicate
+Value *VCMPPS_LT(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_LT_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_LE_OQ predicate
+Value *VCMPPS_LE(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_LE_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_UNORD_Q predicate
+Value *VCMPPS_ISNAN(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_UNORD_Q)));
+}
+
+/// @brief VCMPPS with the _CMP_NEQ_OQ predicate
+Value *VCMPPS_NEQ(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_NEQ_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_GE_OQ predicate
+Value *VCMPPS_GE(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_GE_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_GT_OQ predicate
+Value *VCMPPS_GT(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_GT_OQ)));
+}
+
+/// @brief VCMPPS with the _CMP_ORD_Q predicate
+Value *VCMPPS_NOTNAN(Value* a, Value* b)
+{
+    return VCMPPS(a, b, C(static_cast<uint8_t>(_CMP_ORD_Q)));
+}
+// mask conversions: callers apply MASK to a lane mask before per-lane SELECTs and VMASK to an ICMP result before storing it
+Value *MASK(Value* vmask);
+Value *VMASK(Value* mask);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief functions that build IR to call x86 intrinsics directly, or
+/// emulate them with other instructions if not available on the host
+//////////////////////////////////////////////////////////////////////////
+Value *MASKLOADD(Value* src, Value* mask);
+// format-level gather entry point; takes the format enum rather than a SWR_FORMAT_INFO
+void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+// float gathers: GATHERPS is a single gather, GATHER4PS fills vGatherComponents[] with one vector per component
+Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
+void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+// integer (dword) gathers, same shape as the float pair above
+Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
+void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+// emulated scatter: one store per lane, masked off lanes write to a scratch stack slot
+void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
+// rearrange gathered 8bit / 16bit components into one vector per component (called by GATHER4DD for 8 and 16 bit components)
+void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
+void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
+// wrappers named after the x86 instructions they stand for; only declared here
+Value *PSHUFB(Value* a, Value* b);
+Value *PMOVSXBD(Value* a);
+Value *PMOVSXWD(Value* a);
+Value *PERMD(Value* a, Value* idx);
+Value *CVTPH2PS(Value* a);
+Value *CVTPS2PH(Value* a, Value* rounding);
+Value *PMAXSD(Value* a, Value* b);
+Value *PMINSD(Value* a, Value* b);
+Value *VABSPS(Value* a); // absolute value by clearing the sign bit of each lane
+Value *FMADDPS(Value* a, Value* b, Value* c); // a * b + c, fused only when the arch reports AVX2
+
+// LLVM removed the VPCMPGTD x86 intrinsic; it is emulated here with a
+// compare followed by a sign extension to 32bit lanes.
+// NOTE(review): the compare is unsigned (ICMP_UGT) - confirm that callers
+// expect an unsigned greater-than.
+Value *VPCMPGTD(Value* a, Value* b)
+{
+    Value* isGreater = ICMP_UGT(a, b);
+
+    // x86 intrinsic masks need the high bit of each lane set, so widen the
+    // compare result by sign extension
+    Type* maskTy = VectorType::get(mInt32Ty, JM()->mVWidth);
+    return S_EXT(isGreater, maskTy);
+}
+// clamps: ICLAMP uses signed integer compares, the FCLAMP overloads take either Value* or immediate float bounds
+Value *ICLAMP(Value* src, Value* low, Value* high);
+Value *FCLAMP(Value* src, Value* low, Value* high);
+Value *FCLAMP(Value* src, float low, float high);
+// PRINT inserts a print call into the generated code; STACKSAVE/STACKRESTORE wrap the llvm stacksave/stackrestore intrinsics
+CallInst *PRINT(const std::string &printStr);
+CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
+Value* STACKSAVE();
+void STACKRESTORE(Value* pSaved);
+// wraps the llvm ctpop intrinsic for the type of a
+Value* POPCNT(Value* a);
+// emits INTERRUPT with the constant 3
+Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
+
+// half-vector extract / insert; imm8 == 0 addresses the lower half, anything else the upper half
+Value *VEXTRACTI128(Value* a, Constant* imm8);
+Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
new file mode 100644
index 00000000000..c5a180e27cb
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -0,0 +1,1431 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file fetch_jit.cpp
+*
+* @brief Implementation of the fetch jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "fetch_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+#include <sstream>
+#include <tuple>
+
+//#define FETCH_DUMP_VERTEX 1
+
+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); // forward declaration only
+/// Conversion selector carried in the Shuffle8bpcArgs / Shuffle16bpcArgs tuples of FetchJit.
+enum ConversionType
+{
+ CONVERT_NONE,
+ CONVERT_NORMALIZED,
+ CONVERT_USCALED,
+ CONVERT_SSCALED,
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to Jitting a fetch shader: a Builder that generates the fetch function for a FETCH_COMPILE_STATE
+//////////////////////////////////////////////////////////////////////////
+struct FetchJit : public Builder
+{
+ FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
+ // Create() builds the fetch function; the GetSimdValid*Indices helpers load indices for the 32/16/8 bit index types when the OOB check is enabled
+ Function* Create(const FETCH_COMPILE_STATE& fetchState);
+ Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
+ Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
+ Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
+
+ // package up Shuffle*bpcGatherd args into a tuple for convenience
+ typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
+ uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
+ const uint32_t (&)[4]> Shuffle8bpcArgs;
+ void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
+ // 16 bit variant: takes two gather results and, unlike Shuffle8bpcArgs, has no trailing const uint32_t(&)[4] element
+ typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
+ uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
+ void Shuffle16bpcGather(Shuffle16bpcArgs &args);
+
+ void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+
+ Value* GenerateCompCtrlVector(const ComponentControl ctrl);
+ // two alternative attribute fetch paths; Create() picks JitLoadVertices when fetchState.bDisableVGATHER is set, JitGatherVertices otherwise
+ void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
+ void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
+};
+/// @brief Builds, verifies and optimizes the LLVM function for one fetch state: loads a SIMD of indices, stores vertex IDs (and the cut mask if enabled), then fetches all attributes into the output simdvertex. Returns the generated Function.
+Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
+{
+ static std::size_t fetchNum = 0;
+ // NOTE(review): plain static counter, incremented below without synchronization
+ std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); // ate: the counter is appended after the initial text
+ fnName << fetchNum++;
+
+ Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
+
+ IRB()->SetInsertPoint(entry);
+
+ auto argitr = fetch->getArgumentList().begin();
+
+ // Fetch shader arguments
+ Value* fetchInfo = &*argitr; ++argitr;
+ fetchInfo->setName("fetchInfo");
+ Value* pVtxOut = &*argitr;
+ pVtxOut->setName("vtxOutput");
+ // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
+ // index 0(just the pointer to the simdvertex structure
+ // index 1(which element of the simdvertex structure to offset to(in this case 0)
+ // so the indices being i32's doesn't matter
+ // TODO: generate this GEP with a VECTOR structure type so this makes sense
+ std::vector<Value*> vtxInputIndices(2, C(0)); // NOTE(review): not referenced again in this function
+ // GEP
+ pVtxOut = GEP(pVtxOut, C(0));
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+
+ // SWR_FETCH_CONTEXT::pStreams
+ Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
+ streams->setName("pStreams");
+
+ // SWR_FETCH_CONTEXT::pIndices
+ Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
+ indices->setName("pIndices");
+
+ // SWR_FETCH_CONTEXT::pLastIndex
+ Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
+ pLastIndex->setName("pLastIndex");
+
+ // load one SIMD of indices; 8 and 16 bit indices are zero extended to 32 bits, and the GetSimdValid*Indices helpers are used unless bDisableIndexOOBCheck is set
+ Value* vIndices;
+ switch(fetchState.indexType)
+ {
+ case R8_UINT:
+ indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
+ if(fetchState.bDisableIndexOOBCheck){
+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+ vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+ }
+ else{
+ pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
+ vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
+ }
+ break;
+ case R16_UINT:
+ indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
+ if(fetchState.bDisableIndexOOBCheck){
+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+ vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+ }
+ else{
+ pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
+ vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
+ }
+ break;
+ case R32_UINT:
+ (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
+ : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
+ break; // incoming type is already 32bit int
+ default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break; // NOTE(review): vIndices stays null on this path but is still passed to STORE below
+ }
+
+ // store out vertex IDs
+ STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
+
+ // store out cut mask if enabled
+ if (fetchState.bEnableCutIndex)
+ {
+ Value* vCutIndex = VIMMED1(fetchState.cutIndex);
+ Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); // lanes whose index equals the cut index
+ STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
+ }
+
+ // Fetch attributes from memory and output to a simdvertex struct
+ // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
+ (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
+ : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
+
+ RET_VOID();
+ // dump the unoptimized IR (file tag "src"), then verify it
+ JitManager::DumpToFile(fetch, "src");
+
+ verifyFunction(*fetch);
+
+ FunctionPassManager setupPasses(JM()->mpCurrentModule);
+
+ ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
+ setupPasses.add(createBreakCriticalEdgesPass());
+ setupPasses.add(createCFGSimplificationPass());
+ setupPasses.add(createEarlyCSEPass());
+ setupPasses.add(createPromoteMemoryToRegisterPass());
+
+ setupPasses.run(*fetch);
+ // dump the IR after the setup passes (file tag "se")
+ JitManager::DumpToFile(fetch, "se");
+
+ FunctionPassManager optPasses(JM()->mpCurrentModule);
+
+ ///@todo Haven't touched these either. Need to remove some of these and add others.
+ optPasses.add(createCFGSimplificationPass());
+ optPasses.add(createEarlyCSEPass());
+ optPasses.add(createInstructionCombiningPass());
+ optPasses.add(createInstructionSimplifierPass());
+ optPasses.add(createConstantPropagationPass());
+ optPasses.add(createSCCPPass());
+ optPasses.add(createAggressiveDCEPass());
+ // the optimization pipeline is run twice over the function
+ optPasses.run(*fetch);
+ optPasses.run(*fetch);
+ // dump the optimized IR (file tag "opt")
+ JitManager::DumpToFile(fetch, "opt");
+
+ return fetch;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads attributes from memory using LOADs, shuffling the
+/// components into SOA form.
+/// *Note* currently does not support component control,
+/// component packing, or instancing
+/// @param fetchState - info about attributes to be fetched from memory
+/// @param streams - value pointer to the current vertex stream
+/// @param vIndices - vector value of indices to load
+/// @param pVtxOut - value pointer to output simdvertex struct
+void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
+{
+ // Emits IR that, for every attribute in fetchState.layout, loads one 4-component
+ // element per SIMD lane with scalar address math, converts it to 32-bit lanes,
+ // transposes the per-vertex xyzw vectors into per-component SIMD vectors and
+ // stores the four results to pVtxOut[nelt * 4 + c].
+ // Zack shuffles; a variant of the Charleston.
+
+ SWRL::UncheckedFixedVector<Value*, 16> vectors;
+
+ // Shuffle mask that widens a 4-wide vector to mVWidth lanes: lanes 0-3 keep
+ // their source element, every further lane selects index 4, i.e. the first
+ // element of the second shuffle operand (the undef vector uwvec below).
+ std::vector<Constant*> pMask(JM()->mVWidth);
+ for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ {
+ pMask[i] = (C(i < 4 ? i : 4));
+ }
+ Constant* promoteMask = ConstantVector::get(pMask);
+ Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
+
+ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
+
+ for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
+ {
+ Value* elements[4] = {0};
+ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
+ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
+ uint32_t numComponents = info.numComps;
+ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
+
+ // one widened xyzw vector per SIMD lane is collected here for this attribute
+ vectors.clear();
+
+ // load SWR_VERTEX_BUFFER_STATE::pData
+ Value *stream = LOAD(streams, {ied.StreamIndex, 2});
+
+ // load SWR_VERTEX_BUFFER_STATE::pitch
+ Value *stride = LOAD(streams, {ied.StreamIndex, 1});
+ stride = Z_EXT(stride, mInt64Ty);
+
+ // load SWR_VERTEX_BUFFER_STATE::size
+ Value *size = LOAD(streams, {ied.StreamIndex, 3});
+ size = Z_EXT(size, mInt64Ty);
+
+ // byte offset contributed by the start vertex (startVertex * stride), in 64 bits
+ Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
+
+ // Load from the stream.
+ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+ {
+ // Get index
+ Value* index = VEXTRACT(vIndices, C(lane));
+ index = Z_EXT(index, mInt64Ty);
+
+ // offset = index * stride + AlignedByteOffset + startVertex * stride
+ Value* offset = MUL(index, stride);
+ offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
+ offset = ADD(offset, startVertexOffset);
+
+ if (!fetchState.bDisableIndexOOBCheck) {
+ // check for out of bound access, including partial OOB: an element whose end
+ // (offset + info.Bpp) lies beyond 'size' has its offset replaced with 0, so the
+ // load below reads from the start of the stream instead.
+ // NOTE(review): despite its name, 'oob' is true when the access is IN bounds.
+ Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
+ Value *oob = ICMP_ULE(endOffset, size);
+ offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
+ }
+
+ Value* pointer = GEP(stream, offset);
+ // We use a full-lane, but don't actually care.
+ Value* vptr = 0;
+
+ // get a pointer to a 4 component attrib in default address space
+ // NOTE(review): for an unsupported bpc only the assert fires and vptr stays null
+ // for the load below.
+ switch(bpc)
+ {
+ case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
+ case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
+ case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
+ default: SWR_ASSERT(false, "Unsupported underlying bpp!");
+ }
+
+ // load 4 components of attribute (always 4, regardless of numComponents)
+ Value* vec = ALIGNED_LOAD(vptr, 1, false);
+
+ // Convert To FP32 internally
+ switch(info.type[0])
+ {
+ case SWR_TYPE_UNORM:
+ // unsigned int -> float, scaled by 1 / (2^bpc - 1)
+ switch(bpc)
+ {
+ case 8:
+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
+ break;
+ case 16:
+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
+ break;
+ default:
+ SWR_ASSERT(false, "Unsupported underlying type!");
+ break;
+ }
+ break;
+ case SWR_TYPE_SNORM:
+ // signed int -> float, scaled by 1 / 2^(bpc - 1)
+ // NOTE(review): the gather path in this file (Shuffle8bpcGatherd /
+ // Shuffle16bpcGather) scales SNORM by 1/127 and 1/32767 instead - confirm
+ // which factor is intended.
+ switch(bpc)
+ {
+ case 8:
+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
+ break;
+ case 16:
+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
+ break;
+ default:
+ SWR_ASSERT(false, "Unsupported underlying type!");
+ break;
+ }
+ break;
+ case SWR_TYPE_UINT:
+ // Zero extend 8 and 16 bit UINT types to 32 bits; the integer bits are then
+ // bitcast (not converted) to the FP32 vector type.
+ switch(bpc)
+ {
+ case 8:
+ case 16:
+ vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
+ break;
+ case 32:
+ break; // Pass through unchanged.
+ default:
+ SWR_ASSERT(false, "Unsupported underlying type!");
+ break;
+ }
+ break;
+ case SWR_TYPE_SINT:
+ // Sign extend 8 and 16 bit SINT types to 32 bits; the integer bits are then
+ // bitcast (not converted) to the FP32 vector type.
+ switch(bpc)
+ {
+ case 8:
+ case 16:
+ vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
+ break;
+ case 32:
+ break; // Pass through unchanged.
+ default:
+ SWR_ASSERT(false, "Unsupported underlying type!");
+ break;
+ }
+ break;
+ case SWR_TYPE_FLOAT:
+ // only 32 bit floats are handled on this path
+ switch(bpc)
+ {
+ case 32:
+ break; // Pass through unchanged.
+ default:
+ SWR_ASSERT(false, "Unsupported underlying type!");
+ }
+ break;
+ case SWR_TYPE_USCALED:
+ // unsigned int -> float, no scaling (bpc is not checked here)
+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ break;
+ case SWR_TYPE_SSCALED:
+ // signed int -> float, no scaling (bpc is not checked here)
+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
+ break;
+ case SWR_TYPE_UNKNOWN:
+ case SWR_TYPE_UNUSED:
+ SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
+ }
+
+ // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
+ // uwvec: 4 x F32, undef value
+ Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
+ vectors.push_back(wvec);
+ }
+
+ // Shuffle masks for a 4x4 transpose performed independently inside every group
+ // of four lanes; indices >= mVWidth select from the second shuffle operand.
+ std::vector<Constant*> v01Mask(JM()->mVWidth);
+ std::vector<Constant*> v23Mask(JM()->mVWidth);
+ std::vector<Constant*> v02Mask(JM()->mVWidth);
+ std::vector<Constant*> v13Mask(JM()->mVWidth);
+
+ // Concatenate the vectors together.
+ // After this loop elements[k] holds, in lanes 4b..4b+3, the xyzw of the vertex
+ // fetched for lane 4b+k.
+ elements[0] = VUNDEF_F();
+ elements[1] = VUNDEF_F();
+ elements[2] = VUNDEF_F();
+ elements[3] = VUNDEF_F();
+ for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+ {
+ // v01: elements 0,1 of each operand -> a0 a1 b0 b1
+ v01Mask[4 * b + 0] = C(0 + 4 * b);
+ v01Mask[4 * b + 1] = C(1 + 4 * b);
+ v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
+ v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+
+ // v23: elements 2,3 of each operand -> a2 a3 b2 b3
+ v23Mask[4 * b + 0] = C(2 + 4 * b);
+ v23Mask[4 * b + 1] = C(3 + 4 * b);
+ v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
+ v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+
+ // v02: elements 0,2 of each operand -> a0 a2 b0 b2
+ v02Mask[4 * b + 0] = C(0 + 4 * b);
+ v02Mask[4 * b + 1] = C(2 + 4 * b);
+ v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
+ v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+
+ // v13: elements 1,3 of each operand -> a1 a3 b1 b3
+ v13Mask[4 * b + 0] = C(1 + 4 * b);
+ v13Mask[4 * b + 1] = C(3 + 4 * b);
+ v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
+ v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+
+ // insert mask: lanes of group b take elements 0-3 of the second operand, all
+ // other lanes keep the value already accumulated in the first operand
+ std::vector<Constant*> iMask(JM()->mVWidth);
+ for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ {
+ if(((4 * b) <= i) && (i < (4 * (b + 1))))
+ {
+ iMask[i] = C(i % 4 + JM()->mVWidth);
+ }
+ else
+ {
+ iMask[i] = C(i);
+ }
+ }
+ Constant* insertMask = ConstantVector::get(iMask);
+ elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
+ elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
+ elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
+ elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
+ }
+
+ // Two-stage transpose: first pair up xy / zw of vertices (0,1) and (2,3), then
+ // interleave so that elements[0..3] become the x, y, z and w rows.
+ // NOTE(review): 'z2w3z2w3' actually holds z2 w2 z3 w3; the name looks like a typo.
+ Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
+ Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
+ Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
+ Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
+ elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
+ elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
+ elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
+ elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
+
+ // Fill components the format does not provide with defaults (x/y/z = 0, w = 1).
+ // The cases fall through on purpose: entering at case (numComponents + 1)
+ // overwrites every component from index numComponents upwards. With four
+ // components no case matches and nothing is overwritten.
+ switch(numComponents + 1)
+ {
+ case 1: elements[0] = VIMMED1(0.0f);
+ case 2: elements[1] = VIMMED1(0.0f);
+ case 3: elements[2] = VIMMED1(0.0f);
+ case 4: elements[3] = VIMMED1(1.0f);
+ }
+
+ // store the four component rows of attribute nelt to pVtxOut[nelt * 4 + c]
+ for(uint32_t c = 0; c < 4; ++c)
+ {
+ Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
+ STORE(elements[c], dest);
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads attributes from memory using AVX2 GATHER(s).
+///        For every attribute in fetchState.layout the enabled components are
+///        gathered (or generated from their component control), collected in
+///        vVertexElements four at a time and written out with
+///        StoreVertexElements. Lanes whose vertex is out of bounds are masked
+///        out of the gather and receive the gather source value (0).
+/// @param fetchState - info about attributes to be fetched from memory
+/// @param fetchInfo - first argument passed to fetch shader
+/// @param streams - value pointer to the current vertex stream
+/// @param vIndices - vector value of indices to gather
+/// @param pVtxOut - value pointer to output simdvertex struct
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
+                                 Value* streams, Value* vIndices, Value* pVtxOut)
+{
+    // number of entries of vVertexElements filled so far (0..3 between stores)
+    uint32_t currentVertexElement = 0;
+    // next output slot handed to StoreVertexElements
+    uint32_t outputElt = 0;
+    Value* vVertexElements[4];
+
+    Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
+    Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
+    Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
+    Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
+    curInstance->setName("curInstance");
+
+    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+    {
+        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
+        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
+        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
+
+        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+
+        // VGATHER* takes an *i8 src pointer
+        Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
+
+        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
+        Value *vStride = VBROADCAST(stride);
+
+        // max vertex index that is fully in bounds
+        Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
+        maxVertex = LOAD(maxVertex);
+
+        Value *vCurIndices;
+        Value *startOffset;
+        if(ied.InstanceEnable)
+        {
+            Value* stepRate = C(ied.InstanceDataStepRate);
+
+            // prevent a div by 0 for 0 step rate
+            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
+            stepRate = SELECT(isNonZeroStep, stepRate, C(1));
+
+            // calc the current offset into instanced data buffer
+            Value* calcInstance = UDIV(curInstance, stepRate);
+
+            // if step rate is 0, every instance gets instance 0
+            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
+
+            vCurIndices = VBROADCAST(calcInstance);
+
+            startOffset = startInstance;
+        }
+        else
+        {
+            // offset indices by baseVertex
+            vCurIndices = ADD(vIndices, vBaseVertex);
+
+            startOffset = startVertex;
+        }
+
+        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
+        // do 64bit address offset calculations.
+
+        // calculate byte offset to the start of the VB
+        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
+        pStreamBase = GEP(pStreamBase, baseOffset);
+
+        // if we have a start offset, subtract from max vertex. Used for OOB check
+        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
+        Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
+        // if we have a negative value, we're already OOB. clamp at 0.
+        maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
+
+        // Load the in bounds size of a partially valid vertex
+        Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
+        partialInboundsSize = LOAD(partialInboundsSize);
+        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
+        Value* vBpp = VBROADCAST(C(info.Bpp));
+        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
+
+        // does the element fit inside the partially valid size?
+        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
+
+        // are vertices partially OOB?
+        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
+
+        // are vertices fully in bounds?
+        Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
+
+        // blend in any partially OOB indices that have valid elements
+        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
+        vGatherMask = VMASK(vGatherMask);
+
+        // calculate the actual offsets into the VB
+        Value* vOffsets = MUL(vCurIndices, vStride);
+        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+
+        // Packing and component control
+        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
+        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
+                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
+
+        if(info.type[0] == SWR_TYPE_FLOAT)
+        {
+            ///@todo: support 64 bit vb accesses
+            Value* gatherSrc = VIMMED1(0.0f);
+
+            // Gather components from memory to store in a simdvertex structure
+            switch(bpc)
+            {
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            // Gather a SIMD of vertices
+                            vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+                default:
+                    SWR_ASSERT(0, "Tried to fetch invalid FP format");
+                    break;
+            }
+        }
+        else
+        {
+            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
+            ConversionType conversionType = CONVERT_NONE;
+
+            // The *NORM cases deliberately fall through into the matching *INT case:
+            // they share the extension kind and only add the normalization step.
+            switch(info.type[0])
+            {
+                case SWR_TYPE_UNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_UINT:
+                    extendCastType = Instruction::CastOps::ZExt;
+                    break;
+                case SWR_TYPE_SNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_SINT:
+                    extendCastType = Instruction::CastOps::SExt;
+                    break;
+                case SWR_TYPE_USCALED:
+                    conversionType = CONVERT_USCALED;
+                    extendCastType = Instruction::CastOps::UIToFP;
+                    break;
+                case SWR_TYPE_SSCALED:
+                    conversionType = CONVERT_SSCALED;
+                    extendCastType = Instruction::CastOps::SIToFP;
+                    break;
+                default:
+                    break;
+            }
+
+            // value substituted when component of gather is masked
+            Value* gatherSrc = VIMMED1(0);
+
+            // Gather components from memory to store in a simdvertex structure
+            // NOTE(review): unlike the float path there is no default case here, so an
+            // unsupported bpc emits nothing for this attribute without any assert.
+            switch (bpc)
+            {
+                case 8:
+                {
+                    // if we have at least one component to fetch
+                    if(compMask){
+                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+                        // e.g. result of an 8x32bit integer gather for 8bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    SWR_ASSERT(conversionType == CONVERT_NONE);
+
+                    // Gathered components into place in simdvertex struct
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+
+                            // e.g. result of a single 8x32bit integer gather for 32bit components
+                            // 256i - 0    1    2    3    4    5    6    7
+                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+            }
+        }
+    }
+
+    // if we have a partially filled vVertexElement struct, output it.
+    // currentVertexElement is post-incremented after every write, so it already
+    // equals the number of valid entries (the full stores above pass 4 when four
+    // entries are filled). Passing currentVertexElement + 1 would hand one stale or
+    // uninitialized Value* to StoreVertexElements.
+    if(currentVertexElement > 0){
+        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Builds a SIMD of 8 bit indices, each zero extended to 32 bits.
+///        Every lane is bounds checked on its own in scalar code: a lane
+///        whose index address is not below pLastIndex loads a 0 that was
+///        stored on the stack instead, so OOB indices come back as 0.
+/// @param pIndices - pointer to 8 bit indices
+/// @param pLastIndex - upper bound address; only index addresses strictly
+///        below it are loaded from the index buffer
+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // result vector; its lanes are filled in one at a time below
+    Value* vResult = VUNDEF_I();
+
+    // stack slot holding a 0 index, used as the load address for OOB lanes
+    Value* pZeroSlot = ALLOCA(mInt8Ty);
+    STORE(C((uint8_t)0), pZeroSlot);
+
+    int64_t lane = 0;
+    while(lane < JM()->mVWidth)
+    {
+        // address of this lane's index and whether it lies below the bound
+        Value* pLaneIndex = GEP(pIndices, C(lane));
+        Value* inBounds = ICMP_ULT(pLaneIndex, pLastIndex);
+
+        // read either the real index or the zero kept on the stack
+        Value* pSource = SELECT(inBounds, pLaneIndex, pZeroSlot);
+        Value* laneIndex = Z_EXT(LOAD(pSource, "valid index"), mInt32Ty);
+
+        vResult = VINSERT(vResult, laneIndex, lane);
+        ++lane;
+    }
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Builds a SIMD of 16 bit indices, each zero extended to 32 bits.
+///        *Note* the 16bit index checking is done per lane in scalar code
+///        until we have AVX-512 support. A lane whose index address is not
+///        below pLastIndex loads a 0 that was stored on the stack instead,
+///        so OOB indices come back as 0.
+/// @param pIndices - pointer to 16 bit indices
+/// @param pLastIndex - upper bound address; only index addresses strictly
+///        below it are loaded from the index buffer
+Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // result vector; its lanes are filled in one at a time below
+    Value* vResult = VUNDEF_I();
+
+    // stack slot holding a 0 index, used as the load address for OOB lanes
+    Value* pZeroSlot = ALLOCA(mInt16Ty);
+    STORE(C((uint16_t)0), pZeroSlot);
+
+    int64_t lane = 0;
+    while(lane < JM()->mVWidth)
+    {
+        // address of this lane's index and whether it lies below the bound
+        Value* pLaneIndex = GEP(pIndices, C(lane));
+        Value* inBounds = ICMP_ULT(pLaneIndex, pLastIndex);
+
+        // read either the real index or the zero kept on the stack
+        Value* pSource = SELECT(inBounds, pLaneIndex, pZeroSlot);
+        Value* laneIndex = Z_EXT(LOAD(pSource, "valid index"), mInt32Ty);
+
+        vResult = VINSERT(vResult, laneIndex, lane);
+        ++lane;
+    }
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of 32 bit indices with a single masked load. Lanes
+///        that lie at or beyond the number of indices left in the buffer
+///        are masked off; OOB lanes load 0.
+///        The lane offset vector is hard-wired to 8 lanes.
+/// @param pIndices - pointer to 32 bit indices
+/// @param pLastIndex - pointer marking the end of the valid indices
+Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // integer type with the same width (in bits) as a pointer
+    DataLayout dataLayout(JM()->mpCurrentModule);
+    unsigned int ptrBits = dataLayout.getPointerSize() * 8;
+    Type* intPtrTy = Type::getIntNTy(JM()->mContext, ptrBits);
+
+    Value* endAddr = PTR_TO_INT(pLastIndex, intPtrTy);
+    Value* curAddr = PTR_TO_INT(pIndices, intPtrTy);
+
+    // number of indices left in the buffer: (endPtr - curPtr) / sizeof(index),
+    // with the byte count truncated to 32 bits before the signed divide
+    Value* bytesLeft = SUB(endAddr, curAddr);
+    Value* numIndicesLeft = SDIV(TRUNC(bytesLeft, mInt32Ty), C(4));
+
+    // per lane index offsets 0..7 from the base index pointer
+    std::vector<Constant*> laneOffsets;
+    for(int lane = 0; lane < 8; ++lane)
+    {
+        laneOffsets.push_back(C(lane));
+    }
+    Constant* vLaneOffsets = ConstantVector::get(laneOffsets);
+
+    // a lane is loaded only when its offset is below the remaining count
+    // e.g vRemaining    4 4 4 4 4 4 4 4 : 4 indices left to load
+    //     vLaneOffsets  0 1 2 3 4 5 6 7
+    //     ------------------------------
+    //     vLoadMask    -1-1-1-1 0 0 0 0 : offsets < remaining pass
+    //     loaded        0 1 2 3 0 0 0 0 : offsets >= remaining masked to 0
+    Value* vRemaining = VBROADCAST(numIndicesLeft);
+    Value* vLoadMask = VPCMPGTD(vRemaining, vLaneOffsets);
+
+    // VMASKLOAD takes an *i8 src pointer
+    Value* pRawIndices = BITCAST(pIndices, PointerType::get(mInt8Ty, 0));
+
+    // Load the indices; OOB loads 0
+    return MASKLOADD(pRawIndices, vLoadMask);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+/// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult - 8 gathered 8bpc vertices
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param conversionType - conversion applied after extending: none,
+/// normalized, or scaled (signed / unsigned)
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current output element passed to
+/// StoreVertexElements; incremented after every store
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+/// @param swizzle[4] - component swizzle location
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+{
+ // Unpack tuple args
+ Value*& vGatherResult = std::get<0>(args);
+ Value* pVtxOut = std::get<1>(args);
+ const Instruction::CastOps extendType = std::get<2>(args);
+ const ConversionType conversionType = std::get<3>(args);
+ uint32_t &currentVertexElement = std::get<4>(args);
+ uint32_t &outputElt = std::get<5>(args);
+ const ComponentEnable compMask = std::get<6>(args);
+ const ComponentControl (&compCtrl)[4] = std::get<7>(args);
+ Value* (&vVertexElements)[4] = std::get<8>(args);
+ const uint32_t (&swizzle)[4] = std::get<9>(args);
+
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+
+ // have to do extra work for sign extending
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
+ Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // (mVWidth * 2) x 8bit ints, i.e. 16x8bit in a 128bit lane when mVWidth is 8
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask, including any swizzling
+ // (the same 16 byte pattern is repeated for both 128bit lanes)
+ const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
+ const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
+ Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
+ char(y), char(y+4), char(y+8), char(y+12),
+ char(z), char(z+4), char(z+8), char(z+12),
+ char(w), char(w+4), char(w+8), char(w+12),
+ char(x), char(x+4), char(x+8), char(x+12),
+ char(y), char(y+4), char(y+8), char(y+12),
+ char(z), char(z+4), char(z+8), char(z+12),
+ char(w), char(w+4), char(w+8), char(w+12)});
+
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+ Value* vi128XY = nullptr;
+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+ vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
+ // after PERMD: move and pack x and y components in low 64 bits of each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+ }
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
+ }
+
+ // init denormalize variables if needed
+ // NOTE(review): fpCast is only assigned for CONVERT_NORMALIZED and CONVERT_SSCALED.
+ // If the CONVERT_USCALED assert below is compiled out, the loop further down
+ // still uses fpCast and the null conversionFactor for that conversion type.
+ Instruction::CastOps fpCast;
+ Value* conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 127.0));
+ break;
+ case CONVERT_SSCALED:
+ fpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_USCALED:
+ SWR_ASSERT(0, "Type should not be sign extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+ for(uint32_t i = 0; i < 4; i++){
+ if(!isComponentEnabled(compMask, i)){
+ continue;
+ }
+
+ if(compCtrl[i] == ComponentControl::StoreSrc){
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ // sign extend
+ vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
+
+ // denormalize if needed
+ if(conversionType != CONVERT_NONE){
+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ }
+ currentVertexElement++;
+ }
+ else{
+ // component is not sourced from memory; generate it from its component control
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+ }
+
+ if(currentVertexElement > 3){
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ // else zero extend
+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+ {
+ // init denormalize variables if needed
+ // NOTE(review): fpCast is only assigned for CONVERT_NORMALIZED and CONVERT_USCALED.
+ // If the CONVERT_SSCALED assert below is compiled out, the loop further down
+ // still uses fpCast and the null conversionFactor for that conversion type.
+ Instruction::CastOps fpCast;
+ Value* conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 255.0));
+ break;
+ case CONVERT_USCALED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_SSCALED:
+ SWR_ASSERT(0, "Type should not be zero extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+ for(uint32_t i = 0; i < 4; i++){
+ if(!isComponentEnabled(compMask, i)){
+ continue;
+ }
+
+ if(compCtrl[i] == ComponentControl::StoreSrc){
+ // pshufb masks for each component; the source byte is picked by swizzle[i] and
+ // the -1 entries fill the upper three bytes of every 32bit lane (shown as 0 in
+ // the layout comment below)
+ // NOTE(review): a swizzle value outside 0-3 leaves vConstMask null for the
+ // PSHUFB below.
+ Value* vConstMask;
+ switch(swizzle[i]){
+ case 0:
+ // x shuffle mask
+ vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
+ break;
+ case 1:
+ // y shuffle mask
+ vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
+ break;
+ case 2:
+ // z shuffle mask
+ vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
+ break;
+ case 3:
+ // w shuffle mask
+ vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
+ break;
+ default:
+ vConstMask = nullptr;
+ break;
+ }
+
+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb for x channel
+ // 256i - 0 1 2 3 4 5 6 7
+ // x000 x000 x000 x000 x000 x000 x000 x000
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE){
+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ }
+ currentVertexElement++;
+ }
+ else{
+ // component is not sourced from memory; generate it from its component control
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+ }
+
+ if(currentVertexElement > 3){
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ else
+ {
+ // extendType is neither a sign nor a zero extending cast
+ SWR_ASSERT(0, "Unsupported conversion type");
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param bNormalized - do we need to denormalize?
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current offset from simdvertex we're o
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+{
+    // Unpack tuple args. The entries bound by reference below
+    // (currentVertexElement, outputElt, vVertexElements) are updated through args
+    // as rows are filled in and stored.
+    Value* (&vGatherResult)[2] = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+
+    // cast types
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
+        (extendType == Instruction::CastOps::FPExt))
+    {
+        // is this PP float?
+        const bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+        Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+
+        // shuffle mask
+        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+        Value* vi128XY = nullptr;
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+            // after pshufb: group components together in each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+            // after PERMD: move and pack xy components into each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+        }
+
+        // do the same for zw components
+        Value* vi128ZW = nullptr;
+        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+        }
+
+        // init denormalize variables if needed.
+        // Both are given defined defaults so that an unexpected conversion type
+        // (asserted below) never feeds an indeterminate cast opcode or a null
+        // factor into the IR builder when asserts are compiled out.
+        Instruction::CastOps IntToFpCast = Instruction::CastOps::SIToFP;
+        Value* conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+            break;
+        case CONVERT_SSCALED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_ASSERT(0, "Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+                if(bFP) {
+                    // extract 128 bit lanes and convert each half-float component to FP32
+                    vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+                }
+                else {
+                    // extract 128 bit lanes to sign extend each component
+                    vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+
+                    // denormalize if needed; a factor exists only for the supported
+                    // normalized/scaled conversions
+                    if(conversionFactor != nullptr){
+                        vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                    }
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // pshufb masks for each component; an entry stays null when none of the
+        // components that would use it is enabled
+        Value* vConstMask[2] = {nullptr, nullptr};
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+        }
+
+        if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+        }
+
+        // init denormalize variables if needed (defined defaults, see the sign extend path)
+        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
+        Value* conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0f));
+            break;
+        case CONVERT_SSCALED:
+            SWR_ASSERT(0, "Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // select correct constMask for x/z or y/w pshufb
+                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, read the first gather result, else read the second
+                uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                // 256i - 0    1    2    3    4    5    6    7
+                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+                // denormalize if needed; a factor exists only for the supported
+                // normalized/scaled conversions
+                if(conversionFactor != nullptr){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    else
+    {
+        // extendType is none of the cast ops handled above
+        SWR_ASSERT(0, "Unsupported conversion type");
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Emits stores for a simdvertex worth of rows at the current outputElt
+/// @param pVtxOut - base address of VIN output struct
+/// @param outputElt - simdvertex offset in VIN to write to
+/// @param numEltsToStore - number of simdvertex rows to write out
+/// @param vVertexElements - LLVM Value*[] simdvertex to write out; rows whose
+///                          scalar type is not float are replaced in place by
+///                          their bitcast to the SIMD FP32 type
+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+{
+    // outputElt * 4 = offsetting by the size of a simdvertex
+    const uint32_t firstRow = outputElt * 4;
+
+    uint32_t row = 0;
+    while(row < numEltsToStore)
+    {
+        Value*& vElement = vVertexElements[row];
+        const bool bIsFloat = vElement->getType()->getScalarType()->isFloatTy();
+
+#if FETCH_DUMP_VERTEX
+        if(bIsFloat)
+        {
+            PRINT("vVertexElements[%d]: %f\n", {C(row), vElement});
+        }
+        else
+        {
+            PRINT("vVertexElements[%d]: 0x%x\n", {C(row), vElement});
+        }
+#endif
+        // STORE expects FP32 x vWidth type, just bitcast if needed
+        if(!bIsFloat)
+        {
+            vElement = BITCAST(vElement, mSimdFP32Ty);
+        }
+
+        // + row offsets to a 32bit x vWidth row within the current vertex
+        Value* dest = GEP(pVtxOut, C(firstRow + row), "destGEP");
+        STORE(vElement, dest);
+
+        ++row;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Builds the constant vector that a non-source ComponentControl
+///        value stands for
+/// @param ctrl - ComponentControl value
+/// @return undefined vector for NoStore, splat of integer 0 for Store0, splat
+///         of 1.0f for Store1Fp, splat of integer 1 for Store1Int; any other
+///         value (including StoreSrc) asserts and yields an undefined vector
+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+{
+    Value* vResult = nullptr;
+
+    switch(ctrl)
+    {
+    case NoStore:
+        vResult = VUNDEF_I();
+        break;
+    case Store0:
+        vResult = VIMMED1(0);
+        break;
+    case Store1Fp:
+        vResult = VIMMED1(1.0f);
+        break;
+    case Store1Int:
+        vResult = VIMMED1(1);
+        break;
+    case StoreSrc:
+    default:
+        // StoreSrc is handled by the callers; reaching here is a logic error
+        SWR_ASSERT(0, "Invalid component control");
+        vResult = VUNDEF_I();
+        break;
+    }
+
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Tests whether a component's bit is set in the enable mask.
+/// @param enableMask - enable bits
+/// @param component - component index to check: 0 = X, 1 = Y, 2 = Z, 3 = W
+/// @return true if the component's bit is set; false otherwise, including
+///         for any component index above 3
+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
+{
+    // enable bit for each component index, in X, Y, Z, W order
+    static const ComponentEnable componentBit[4] =
+    {
+        ComponentEnable::X,
+        ComponentEnable::Y,
+        ComponentEnable::Z,
+        ComponentEnable::W,
+    };
+
+    if (component > 3)
+    {
+        return false;
+    }
+
+    return (enableMask & componentBit[component]) != 0;
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JITs from fetch shader IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - handle to the LLVM function IR (cast to llvm::Function*)
+/// @return PFN_FETCH_FUNC - pointer to fetch code
+PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
+{
+    const llvm::Function* func = (const llvm::Function*)hFunc;
+    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+    PFN_FETCH_FUNC pfnFetch;
+
+    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
+    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    pJitMgr->mIsModuleFinalized = true;
+
+#if defined(KNOB_SWRC_TRACING)
+    // Build the dump file name from an owned string rather than a fixed-size
+    // buffer: the function name has no length bound, and the name's raw data
+    // pointer is not guaranteed to be NUL-terminated.
+    const auto dumpName = func->getName().str() + ".bin";
+    FILE *fd = fopen(dumpName.c_str(), "wb");
+    // tracing is best effort; skip the dump if the file cannot be created
+    if (fd != nullptr)
+    {
+        // NOTE(review): dumps a fixed 2048 bytes starting at the function address
+        // regardless of the real code size - confirm this is always readable.
+        fwrite((void *)pfnFetch, 1, 2048, fd);
+        fclose(fd);
+    }
+#endif
+
+    return pfnFetch;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles fetch shader
+/// @param hJitMgr - JitManager handle
+/// @param state - fetch state to build function from
+/// @return pointer to the compiled fetch function, as returned by JitFetchFunc
+extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
+{
+    JitManager* const pManager = reinterpret_cast<JitManager*>(hJitMgr);
+
+    // set up a new module before building the fetch function IR
+    pManager->SetupNewModule();
+
+    // build the IR for this fetch state, then JIT it
+    FetchJit fetchBuilder(pManager);
+    HANDLE hFetchIR = fetchBuilder.Create(state);
+
+    return JitFetchFunc(hJitMgr, hFetchIR);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
new file mode 100644
index 00000000000..ea3625d2fde
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -0,0 +1,128 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file fetch_jit.h
+*
+* @brief Definition of the fetch jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// INPUT_ELEMENT_DESC
+///
+/// Packed description of one vertex input element. The bitfields below add
+/// up to 64 bits (12+10+6+1+3+3+3+3+4+19) and share storage with `bits`,
+/// which FETCH_COMPILE_STATE::operator== compares as a whole.
+//////////////////////////////////////////////////////////////////////////
+struct INPUT_ELEMENT_DESC
+{
+ union
+ {
+ struct
+ {
+ // NOTE(review): field meanings below are inferred from their names - confirm
+ // against the fetch jitter that consumes them.
+ uint32_t AlignedByteOffset : 12;
+ uint32_t Format : 10;
+ uint32_t StreamIndex : 6;
+ // compared against 1 in FETCH_COMPILE_STATE::operator== to decide whether
+ // InstanceDataStepRate takes part in the comparison
+ uint32_t InstanceEnable : 1;
+ // one 3-bit control per component; presumably ComponentControl values
+ uint32_t ComponentControl0 : 3;
+ uint32_t ComponentControl1 : 3;
+ uint32_t ComponentControl2 : 3;
+ uint32_t ComponentControl3 : 3;
+ // set from the ComponentEnable enum
+ uint32_t ComponentPacking : 4;
+ // pads the bitfields out to 64 bits
+ uint32_t _reserved : 19;
+ };
+ // all of the bitfields above viewed as one 64-bit value
+ uint64_t bits;
+ };
+ // lives outside the union, so it is not covered by `bits`
+ uint32_t InstanceDataStepRate;
+};
+
+// used to set ComponentPacking
+// One bit per component (X, Y, Z, W); every other enumerator is the OR of the
+// single-component bits that its name spells out.
+enum ComponentEnable
+{
+    NONE = 0x0,
+
+    // single components
+    X = 0x1,
+    Y = 0x2,
+    Z = 0x4,
+    W = 0x8,
+
+    // two components
+    XY = X | Y,
+    XZ = X | Z,
+    YZ = Y | Z,
+    XW = X | W,
+    YW = Y | W,
+    ZW = Z | W,
+
+    // three components
+    XYZ = X | Y | Z,
+    XYW = X | Y | W,
+    XZW = X | Z | W,
+    YZW = Y | Z | W,
+
+    // all components
+    XYZW = X | Y | Z | W,
+};
+
+// Selects what is written for one component of a vertex element.
+// FetchJit::GenerateCompCtrlVector maps the constant-producing values to
+// vectors; StoreSrc is handled by the shuffle routines, which write the
+// fetched source data instead.
+enum ComponentControl
+{
+ NoStore = 0, // GenerateCompCtrlVector returns an undefined vector
+ StoreSrc = 1, // use the component fetched from the source data
+ Store0 = 2, // constant integer 0
+ Store1Fp = 3, // constant 1.0f
+ Store1Int = 4, // constant integer 1
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for fetch shader jit compile.
+///
+/// operator== compares the scalar options plus the first numAttribs entries
+/// of layout.
+//////////////////////////////////////////////////////////////////////////
+struct FETCH_COMPILE_STATE
+{
+    // number of valid entries in layout; starts at 0 so operator== never walks
+    // layout using an indeterminate count
+    uint32_t numAttribs{ 0 };
+    INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
+    // NOTE(review): indexType and layout are not initialized by the constructor;
+    // callers are expected to fill them in before comparing states.
+    SWR_FORMAT indexType;
+    uint32_t cutIndex{ 0xffffffff };
+
+    // Options that affect the JIT'd code
+    bool bDisableVGATHER;               // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs
+    bool bDisableIndexOOBCheck;         // if enabled, FetchJit will exclude index OOB check
+    bool bEnableCutIndex{ false };      // compares indices with the cut index and returns a cut mask
+
+    /// @param disableVGATHER - initial value of bDisableVGATHER
+    /// @param disableIndexOOBCheck - initial value of bDisableIndexOOBCheck
+    FETCH_COMPILE_STATE(bool disableVGATHER = false, bool disableIndexOOBCheck = false) :
+        bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(disableIndexOOBCheck) {}
+
+    /// @brief Two states are equal when all options match and the first
+    ///        numAttribs layout entries match.
+    bool operator==(const FETCH_COMPILE_STATE &other) const
+    {
+        if (numAttribs != other.numAttribs) return false;
+        if (indexType != other.indexType) return false;
+        if (bDisableVGATHER != other.bDisableVGATHER) return false;
+        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
+        if (bEnableCutIndex != other.bEnableCutIndex) return false;
+        if (cutIndex != other.cutIndex) return false;
+
+        for(uint32_t i = 0; i < numAttribs; ++i)
+        {
+            // the step rate is only compared for elements with InstanceEnable set
+            if((layout[i].bits != other.layout[i].bits) ||
+               ((layout[i].InstanceEnable == 1) &&
+                (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){
+                return false;
+            }
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
new file mode 100644
index 00000000000..39d63836673
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -0,0 +1,108 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file jit_api.h
+*
+* @brief Platform independent JIT interface
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+#include "common/os.h"
+
+#include "fetch_jit.h"
+#include "streamout_jit.h"
+#include "blend_jit.h"
+
+// EXCEPTION_PRINT_STACK is only defined on Windows, where it expands to its
+// argument unchanged.
+#if defined(_WIN32)
+#define EXCEPTION_PRINT_STACK(ret) ret
+#endif // _WIN32
+
+// JITCALL decorates the exported JIT entry points: __stdcall on Windows,
+// empty everywhere else.
+#if defined(_WIN32)
+#define JITCALL __stdcall
+#else
+#define JITCALL
+#endif
+
+// Everything below is declared with C linkage.
+extern "C"
+{
+
+// Only forward-declared here; handled through pointers by the shader API below.
+struct ShaderInfo;
+
+//////////////////////////////////////////////////////////////////////////
+/// Jit Compile Info Input
+//////////////////////////////////////////////////////////////////////////
+struct JIT_COMPILE_INPUT
+{
+ SWR_SHADER_TYPE type;
+
+ const void* pIR; ///< Pointer to LLVM IR text.
+
+ bool enableJitSampler;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create JIT context.
+/// @param targetSimdWidth - SIMD width to target (presumably in 32-bit lanes; TODO confirm)
+/// @param arch - architecture name string (accepted values are not visible here)
+/// @return handle to pass as hJitContext to the other Jit* functions
+HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroy JIT context.
+/// @param hJitContext - Jit Context
+void JITCALL JitDestroyContext(HANDLE hJitContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compile shader.
+/// @param hJitContext - Jit Context
+/// @param input - Input containing LLVM IR and other information
+/// @return pointer to a ShaderInfo containing information about the JIT shader
+ShaderInfo* JITCALL JitCompileShader(
+ HANDLE hJitContext,
+ const JIT_COMPILE_INPUT& input);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT destroy shader.
+/// @param hJitContext - Jit Context
+/// @param pShaderInfo - pointer to shader object (passed by reference).
+void JITCALL JitDestroyShader(
+ HANDLE hJitContext,
+ ShaderInfo*& pShaderInfo);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles fetch shader
+/// @param hJitContext - Jit Context
+/// @param state - Fetch state to build function from
+/// @return pointer to the compiled fetch function
+PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles streamout shader
+/// @param hJitContext - Jit Context
+/// @param state - SO state to build function from
+/// @return pointer to the compiled streamout function
+PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles blend shader
+/// @param hJitContext - Jit Context
+/// @param state - blend state to build function from
+/// @return pointer to the compiled blend function
+PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state);
+
+
+}; // extern "C"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
new file mode 100644
index 00000000000..1814b7c8d5f
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -0,0 +1,401 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+#!deps/python32/python.exe
+
+import os, sys, re
+import argparse
+import json as JSON
+import operator
+
+header = r"""/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file %s
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+"""
+
+"""
+"""
+def gen_file_header(filename):
+ global header
+ headerStr = header % filename
+ return headerStr.splitlines()
+
+
+# Overrides for generated Builder method names. Keys are the CAMEL_CASE names
+# that parse_ir_builder derives from an IRBuilder Create* method; values are
+# the names emitted instead. Names without an entry are used as derived.
+inst_aliases = {
+ 'SHUFFLE_VECTOR': 'VSHUFFLE',
+ 'INSERT_ELEMENT': 'VINSERT',
+ 'EXTRACT_ELEMENT': 'VEXTRACT',
+ 'MEM_SET': 'MEMSET',
+ 'MEM_CPY': 'MEMCPY',
+ 'MEM_MOVE': 'MEMMOVE',
+ 'L_SHR': 'LSHR',
+ 'A_SHR': 'ASHR',
+ 'BIT_CAST': 'BITCAST',
+ 'U_DIV': 'UDIV',
+ 'S_DIV': 'SDIV',
+ 'U_REM': 'UREM',
+ 'S_REM': 'SREM',
+ 'BIN_OP': 'BINOP',
+}
+
+# x86 intrinsic wrappers to generate. Each entry is
+#   [Builder method name, LLVM intrinsic id (emitted as Intrinsic::<id>), [argument names]]
+# The argument names become Value* parameters of the generated method and are
+# passed to the intrinsic call in the order listed.
+# NOTE(review): the VPERMD argument names ("idx", "a") may not match the
+# operand order of the LLVM intrinsic - verify against the LLVM definition.
+intrinsics = [
+ ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+ ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
+ ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+ ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+ ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+ ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+ ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+ ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+ ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+ ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+ ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
+ ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
+ ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
+ ["VMASKLOADD", "x86_avx2_maskload_d_256", ["src", "mask"]],
+ ["VMASKMOVPS", "x86_avx_maskload_ps_256", ["src", "mask"]],
+ ["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]],
+ ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components
+ ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components
+ ["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+ ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
+ ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
+ ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
+ ["VPTESTC", "x86_avx_ptestc_256", ["a", "b"]],
+ ["VPTESTZ", "x86_avx_ptestz_256", ["a", "b"]],
+ ["VFMADDPS", "x86_fma_vfmadd_ps_256", ["a", "b", "c"]],
+ ["VCVTTPS2DQ", "x86_avx_cvtt_ps2dq_256", ["a"]],
+ ["VMOVMSKPS", "x86_avx_movmsk_ps_256", ["a"]],
+ ["INTERRUPT", "x86_int", ["a"]],
+ ]
+
+def convert_uppercamel(name):
+ s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+ return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper()
+
+"""
+ Given an input file (e.g. IRBuilder.h) generates function dictionary.
+"""
+def parse_ir_builder(input_file):
+ # Scans the text of input_file for Create* method declarations and returns
+ # a list of dicts with keys "name", "alias", "return", "args",
+ # "args_nodefs" and "arg_names", one per function that is kept.
+ # NOTE(review): this listing has lost its indentation, so the nesting of the
+ # statements below cannot be read off the text - confirm against the
+ # original file before changing any logic.
+
+ functions = []
+
+ lines = input_file.readlines()
+
+ idx = 0
+ # NOTE(review): the "- 1" bound means the last line is never examined as the
+ # start of a declaration - confirm this is intended.
+ while idx < len(lines) - 1:
+ line = lines[idx].rstrip()
+ idx += 1
+
+ #match = re.search(r"\*Create", line)
+ # a candidate line has "Create<Name>(" preceded by '*' or whitespace
+ match = re.search(r"[\*\s]Create(\w*)\(", line)
+ if match is not None:
+ #print("Line: %s" % match.group(1))
+
+ # when the line starts with "Create", the previous line is prepended to the
+ # signature (presumably it holds the return type)
+ if re.search(r"^\s*Create", line) is not None:
+ func_sig = lines[idx-2].rstrip() + line
+ else:
+ func_sig = line
+
+ # keep appending lines until one contains a closing parenthesis
+ # NOTE(review): lines[idx] is not bounds-checked here - an unterminated
+ # declaration at the end of the file would raise IndexError.
+ end_of_args = False
+ while not end_of_args:
+ end_paren = re.search(r"\)", line)
+ if end_paren is not None:
+ end_of_args = True
+ else:
+ line = lines[idx].rstrip()
+ func_sig += line
+ idx += 1
+
+ # deleted functions are skipped
+ delfunc = re.search(r"LLVM_DELETED_FUNCTION|= delete;", func_sig)
+
+ if not delfunc:
+ # group 1: text before the '*', group 2: Create* name, group 3: argument list
+ func = re.search(r"(.*?)\*[\n\s]*(Create\w*)\((.*?)\)", func_sig)
+ if func is not None:
+
+ return_type = func.group(1).lstrip() + '*'
+ func_name = func.group(2)
+ arguments = func.group(3)
+
+ func_args = ''
+ func_args_nodefs = ''
+
+ # this count is overwritten by "num_args = 0" below before it is used
+ num_args = arguments.count(',')
+
+ arg_names = []
+ num_args = 0
+ args = arguments.split(',')
+ for arg in args:
+ arg = arg.lstrip()
+ if arg:
+ if num_args > 0:
+ func_args += ', '
+ func_args_nodefs += ', '
+ func_args += arg
+ # same argument with any " = default" part cut off
+ func_args_nodefs += arg.split(' =')[0]
+
+ # the argument name is the last whitespace-separated token before any '='
+ split_args = arg.split('=')
+ arg_name = split_args[0].rsplit(None, 1)[-1]
+
+ #print("Before ArgName = %s" % arg_name)
+
+ # drop any leading '&' or '*' from the name
+ reg_arg = re.search(r"[\&\*]*(\w*)", arg_name)
+ if reg_arg:
+ #print("Arg Name = %s" % reg_arg.group(1))
+ arg_names += [reg_arg.group(1)]
+
+ num_args += 1
+
+ ignore = False
+
+ # The following functions need to be ignored.
+ if func_name == 'CreateInsertNUWNSWBinOp':
+ ignore = True
+
+ if func_name == 'CreateMaskedIntrinsic':
+ ignore = True
+
+ # Convert CamelCase to CAMEL_CASE
+ func_mod = re.search(r"Create(\w*)", func_name)
+ if func_mod:
+ func_mod = func_mod.group(1)
+ func_mod = convert_uppercamel(func_mod)
+ # a leading "F_" or "I_" loses its underscore (e.g. F_ADD becomes FADD)
+ if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_':
+ func_mod = func_mod[0] + func_mod[2:]
+
+ # Substitute alias based on CAMEL_CASE name.
+ func_alias = inst_aliases.get(func_mod)
+ if not func_alias:
+ func_alias = func_mod
+
+ # CreateCall/CreateGEP declarations whose arguments mention ArrayRef get an
+ # 'A' suffix on their alias
+ if func_name == 'CreateCall' or func_name == 'CreateGEP':
+ arglist = re.search(r'ArrayRef', func_args)
+ if arglist:
+ func_alias = func_alias + 'A'
+
+ if not ignore:
+ functions.append({
+ "name": func_name,
+ "alias": func_alias,
+ "return": return_type,
+ "args": func_args,
+ "args_nodefs": func_args_nodefs,
+ "arg_names": arg_names
+ })
+
+ return functions
+
+"""
+ Auto-generates macros for LLVM IR
+"""
+def generate_gen_h(functions, output_file):
+ output_lines = gen_file_header(os.path.basename(output_file.name))
+
+ output_lines += [
+ '#pragma once',
+ '',
+ '//////////////////////////////////////////////////////////////////////////',
+ '/// Auto-generated Builder IR declarations',
+ '//////////////////////////////////////////////////////////////////////////',
+ ]
+
+ for func in functions:
+ name = func['name']
+ if func['alias']:
+ name = func['alias']
+ output_lines += [
+ '%s%s(%s);' % (func['return'], name, func['args'])
+ ]
+
+ output_file.write('\n'.join(output_lines) + '\n')
+
+"""
+ Auto-generates macros for LLVM IR
+"""
+def generate_gen_cpp(functions, output_file):
+ output_lines = gen_file_header(os.path.basename(output_file.name))
+
+ output_lines += [
+ '#include \"builder.h\"',
+ ''
+ ]
+
+ for func in functions:
+ name = func['name']
+ if func['alias']:
+ name = func['alias']
+
+ args = func['arg_names']
+ func_args = ''
+ first_arg = True
+ for arg in args:
+ if not first_arg:
+ func_args += ', '
+ func_args += arg
+ first_arg = False
+
+ output_lines += [
+ '//////////////////////////////////////////////////////////////////////////',
+ '%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']),
+ '{',
+ ' return IRB()->%s(%s);' % (func['name'], func_args),
+ '}',
+ '',
+ ]
+
+ output_file.write('\n'.join(output_lines) + '\n')
+
+"""
+ Auto-generates macros for LLVM IR
+"""
+def generate_x86_h(output_file):
+ output_lines = gen_file_header(os.path.basename(output_file.name))
+
+ output_lines += [
+ '#pragma once',
+ '',
+ '//////////////////////////////////////////////////////////////////////////',
+ '/// Auto-generated x86 intrinsics',
+ '//////////////////////////////////////////////////////////////////////////',
+ ]
+
+ for inst in intrinsics:
+ #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2])))
+
+ args = ''
+ first = True
+ for arg in inst[2]:
+ if not first:
+ args += ', '
+ args += ("Value* %s" % arg)
+ first = False
+
+ output_lines += [
+ 'Value *%s(%s);' % (inst[0], args)
+ ]
+
+ output_file.write('\n'.join(output_lines) + '\n')
+
+"""
+ Auto-generates macros for LLVM IR
+"""
+def generate_x86_cpp(output_file):
+ output_lines = gen_file_header(os.path.basename(output_file.name))
+
+ output_lines += [
+ '#include \"builder.h\"',
+ ''
+ ]
+
+ for inst in intrinsics:
+ #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2])))
+
+ args = ''
+ pass_args = ''
+ first = True
+ for arg in inst[2]:
+ if not first:
+ args += ', '
+ pass_args += ', '
+ args += ("Value* %s" % arg)
+ pass_args += arg
+ first = False
+
+ output_lines += [
+ '//////////////////////////////////////////////////////////////////////////',
+ 'Value *Builder::%s(%s)' % (inst[0], args),
+ '{',
+ ' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1],
+ ' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args,
+ '}',
+ '',
+ ]
+
+ output_file.write('\n'.join(output_lines) + '\n')
+
+"""
+ Function which is invoked when this script is started from a command line.
+ Will present and consume a set of arguments which will tell this script how
+ to behave
+"""
+def main():
+
+ # Parse args...
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input", "-i", type=argparse.FileType('r'), help="Path to IRBuilder.h", required=False)
+ parser.add_argument("--output", "-o", type=argparse.FileType('w'), help="Path to output file", required=True)
+ parser.add_argument("--gen_h", "-gen_h", help="Generate builder_gen.h", action="store_true", default=False)
+ parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate builder_gen.cpp", action="store_true", default=False)
+ parser.add_argument("--gen_x86_h", "-gen_x86_h", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False)
+ parser.add_argument("--gen_x86_cpp", "-gen_x86_cpp", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False)
+ args = parser.parse_args()
+
+ if args.input:
+ functions = parse_ir_builder(args.input)
+
+ if args.gen_h:
+ generate_gen_h(functions, args.output)
+
+ if args.gen_cpp:
+ generate_gen_cpp(functions, args.output)
+ else:
+ if args.gen_x86_h:
+ generate_x86_h(args.output)
+
+ if args.gen_x86_cpp:
+ generate_x86_cpp(args.output)
+
+ if args.gen_h:
+ print("Need to specify --input for --gen_h!")
+
+ if args.gen_cpp:
+ print("Need to specify --input for --gen_cpp!")
+
+if __name__ == '__main__':
+ main()
+# END OF FILE
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
new file mode 100644
index 00000000000..7bba435467b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -0,0 +1,341 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+#!deps/python32/python.exe
+
+import os, sys, re
+import argparse
+import json as JSON
+import operator
+
+header = r"""
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file %s
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+#pragma once
+
+"""
+
+"""
+"""
+def gen_file_header(filename):
+ global header
+ headerStr = header % filename
+ return headerStr.splitlines()
+
+"""
+"""
+def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): # returns a one-element list holding the C++ 'members.push_back(...)' line for this field; output_file is not used in the body
+
+ llvm_type = ''
+
+ if is_llvm_struct:
+ if is_pointer or is_pointer_pointer:
+ llvm_type = 'Type::getInt32Ty(ctx)' # @llvm_struct reached through a pointer: i32 is used as the base type
+ else:
+ llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type # by-value @llvm_struct: opaque byte array of sizeof(type)
+ elif is_llvm_enum:
+ llvm_type = 'Type::getInt32Ty(ctx)' # @llvm_enum: i32
+ elif is_llvm_pfn:
+ llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' # @llvm_pfn: i8 pointer
+ else:
+ if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool':
+ llvm_type = 'Type::getInt8Ty(ctx)'
+ elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t':
+ llvm_type = 'Type::getInt64Ty(ctx)'
+ elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
+ llvm_type = 'Type::getInt16Ty(ctx)'
+ elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t':
+ llvm_type = 'Type::getInt32Ty(ctx)'
+ elif type == 'float' or type == 'FLOAT':
+ llvm_type = 'Type::getFloatTy(ctx)'
+ elif type == 'double' or type == 'DOUBLE':
+ llvm_type = 'Type::getDoubleTy(ctx)'
+ elif type == 'void' or type == 'VOID':
+ llvm_type = 'Type::getInt32Ty(ctx)' # void is also represented as i32
+ elif type == 'HANDLE':
+ llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)'
+ elif type == 'simdscalar':
+ llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)'
+ elif type == 'simdscalari':
+ llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
+ elif type == 'simdvector':
+ llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)'
+ else:
+ llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) # any other type name is emitted as a call to a Gen_<type><postfix> function
+
+ if is_pointer:
+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type # wrap the type built so far in a pointer
+
+ if is_pointer_pointer:
+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type # second level of indirection
+
+ if is_array_array:
+ llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) # array_count1 is the inner dimension, array_count the outer one
+ elif is_array:
+ llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
+
+ return [' members.push_back( %s ); // %s' % (llvm_type, name)]
+
+"""
+"""
+def gen_llvm_types(input_file, output_file): # scans input_file for struct definitions and writes one Gen_<struct> function plus member-index constants per struct to output_file
+
+ output_lines = gen_file_header(os.path.basename(output_file.name))
+
+ lines = input_file.readlines()
+
+ postfix_name = ""
+
+ for idx in range(len(lines)):
+ line = lines[idx].rstrip()
+
+ match = re.match(r"(\s*)struct(\s*)(\w+)", line)
+ if match:
+ llvm_args = [] # member names, in declaration order
+
+ # Detect start of structure
+ is_fwd_decl = re.search(r";", line) # a ';' on the struct line is taken to mean a forward declaration
+
+ if not is_fwd_decl:
+
+ # Extract the struct name
+ struct_name = match.group(3).strip()
+
+ output_lines += [
+ '//////////////////////////////////////////////////////////////////////////',
+ '/// Generate LLVM type information for %s' % struct_name,
+ 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name),
+ '{',
+ ' LLVMContext& ctx = pJitMgr->mContext;',
+ ' std::vector<Type*> members;',
+ '',
+ ]
+
+ end_of_struct = False
+
+ while not end_of_struct and idx < len(lines)-1:
+ idx += 1 # local advance only; the enclosing for loop continues from its own range value
+ line = lines[idx].rstrip()
+
+ is_llvm_typedef = re.search(r"@llvm_typedef", line)
+ if is_llvm_typedef is not None:
+ is_llvm_typedef = True
+ else:
+ is_llvm_typedef = False
+
+ ###########################################
+ # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure.
+ is_llvm_struct = re.search(r"@llvm_struct", line)
+
+ if is_llvm_struct is not None:
+ is_llvm_struct = True
+ else:
+ is_llvm_struct = False
+
+ ###########################################
+ # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type.
+ is_llvm_enum = re.search(r"@llvm_enum", line)
+
+ if is_llvm_enum is not None:
+ is_llvm_enum = True
+ else:
+ is_llvm_enum = False
+
+ ###########################################
+ # Is field a llvm function pointer? Tells script to treat type as a function pointer and replace it with an i8 pointer type.
+ is_llvm_pfn = re.search(r"@llvm_pfn", line)
+
+ if is_llvm_pfn is not None:
+ is_llvm_pfn = True
+ else:
+ is_llvm_pfn = False
+
+ ###########################################
+ # Is field const?
+ is_const = re.search(r"\s+const\s+", line)
+
+ if is_const is not None:
+ is_const = True
+ else:
+ is_const = False
+
+ ###########################################
+ # Is field a pointer to a pointer?
+ is_pointer_pointer = re.search("\*\*", line)
+
+ if is_pointer_pointer is not None:
+ is_pointer_pointer = True
+ else:
+ is_pointer_pointer = False
+
+ ###########################################
+ # Is field a pointer?
+ is_pointer = re.search("\*", line) # also matches when the line contains '**'
+
+ if is_pointer is not None:
+ is_pointer = True
+ else:
+ is_pointer = False
+
+ ###########################################
+ # Is field an array of arrays?
+ # TODO: Can add this to a list.
+ is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line)
+ array_count = '0'
+ array_count1 = '0'
+
+ if is_array_array is not None:
+ array_count = is_array_array.group(1)
+ array_count1 = is_array_array.group(2)
+ is_array_array = True
+ else:
+ is_array_array = False
+
+ ###########################################
+ # Is field an array?
+ is_array = re.search("\[(\w*)\]", line) # also matches the first dimension of a 2D array, so array_count is assigned the same value again
+
+ if is_array is not None:
+ array_count = is_array.group(1)
+ is_array = True
+ else:
+ is_array = False
+
+ is_scoped = re.search("::", line)
+
+ if is_scoped is not None:
+ is_scoped = True
+ else:
+ is_scoped = False
+
+ type = None
+ name = None
+ if is_const and is_pointer:
+
+ if is_scoped:
+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) # NOTE(review): field_match is used without a None check in this branch, unlike the branches further down
+
+ type = "%s%s" % (field_match.group(4), field_match.group(5))
+ name = field_match.group(7)
+ else:
+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) # NOTE(review): same here, no None check
+
+ type = field_match.group(4)
+ name = field_match.group(6)
+
+ elif is_pointer:
+ field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line)
+
+ if field_match:
+ type = field_match.group(3)
+ name = field_match.group(5)
+ elif is_const:
+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line)
+
+ if field_match:
+ type = field_match.group(4)
+ name = field_match.group(6)
+ else:
+ if is_scoped:
+ field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line)
+
+ if field_match:
+ type = field_match.group(1) + '::' + field_match.group(2)
+ name = field_match.group(3)
+ else:
+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line)
+
+ if field_match:
+ type = field_match.group(2)
+ name = field_match.group(4)
+
+ if is_llvm_typedef is False:
+ if type is not None:
+ output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file)
+ llvm_args.append(name)
+
+ # Detect end of structure
+ end_of_struct = re.match(r"(\s*)};", line)
+
+ if (end_of_struct):
+ output_lines += [
+ '',
+ ' return StructType::get(ctx, members, false);',
+ '}',
+ '',
+ ]
+
+ for i in range(len(llvm_args)):
+ output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) # <struct>_<member> = position of the member in the generated struct type
+
+ output_lines.append('')
+
+ output_file.write('\n'.join(output_lines) + '\n')
+
+"""
+ Function which is invoked when this script is started from a command line.
+ Will present and consume a set of arguments which will tell this script how
+ to behave
+"""
+def main():
+
+ # Parse args...
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input", "-i", type=argparse.FileType('r'),
+ help="Path to input file containing structs", required=True)
+ parser.add_argument("--output", "-o", type=argparse.FileType('w'),
+ help="Path to output file", required=True)
+ parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False)
+ args = parser.parse_args()
+
+ gen_llvm_types(args.input, args.output)
+
+if __name__ == '__main__':
+ main()
+# END OF FILE
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
new file mode 100644
index 00000000000..6c5f22bc47c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -0,0 +1,357 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.cpp
+*
+* @brief Implementation of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "streamout_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+
+#include <sstream>
+#include <unordered_set>
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to Jitting a streamout shader
+//////////////////////////////////////////////////////////////////////////
+struct StreamOutJit : public Builder
+{
+    // Hands the JIT manager to the Builder base; no state of its own.
+    StreamOutJit(JitManager* pJitMgr)
+        : Builder(pJitMgr)
+    {
+    }
+
+ // returns pointer to SWR_STREAMOUT_BUFFER
+    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
+    {
+        // load entry 'buffer' of the context's pBuffer member
+        Value* pSoBuffer = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
+        return pSoBuffer;
+    }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // @brief checks if streamout buffer is oob
+ // @return <i1> true/false
+    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
+    {
+        Value* isOob = C(false);
+
+        Value* pSoBuffer = getSOBuffer(pSoCtx, buffer);
+
+        // enable flag, narrowed to <i1>
+        // @todo bool data types should generate <i1> llvm type
+        Value* enableRaw = LOAD(pSoBuffer, { 0, SWR_STREAMOUT_BUFFER_enable });
+        Value* isEnabled = TRUNC(enableRaw, IRB()->getInt1Ty());
+
+        // size, current write offset and pitch of the buffer
+        Value* sizeOfBuffer = LOAD(pSoBuffer, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
+        Value* curOffset = LOAD(pSoBuffer, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+        Value* bufPitch = LOAD(pSoBuffer, { 0, SWR_STREAMOUT_BUFFER_pitch });
+
+        // a buffer that a decl refers to but that is not enabled counts as oob
+        Value* notEnabled = NOT(isEnabled);
+        isOob = OR(isOob, notEnabled);
+
+        // the buffer must have room for one more primitive,
+        // i.e. numVertsPerPrim vertices of 'pitch' each
+        Value* primSize = MUL(bufPitch, C(state.numVertsPerPrim));
+        Value* endOffset = ADD(curOffset, primSize);
+        Value* overruns = ICMP_SGT(endOffset, sizeOfBuffer);
+
+        return OR(isOob, overruns);
+    }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
+ // packing the active mask bits
+ // ex. bitmask 0011 -> (0, 1, 0, 0)
+ // bitmask 1000 -> (3, 0, 0, 0)
+ // bitmask 1100 -> (2, 3, 0, 0)
+    Value* PackMask(uint32_t bitmask)
+    {
+        // The result has exactly four lanes. Drop any higher bits so that
+        // 'elem' can never index past the end of 'indices'.
+        bitmask &= 0xF;
+
+        // lanes that receive no component keep index 0
+        std::vector<Constant*> indices(4, C(0));
+        DWORD index;
+        uint32_t elem = 0;
+        while (_BitScanForward(&index, bitmask))
+        {
+            // next packed lane selects the lowest component still set
+            indices[elem++] = C((int)index);
+            // clear that bit; unsigned literal keeps the shift well defined
+            bitmask &= ~(1u << index);
+        }
+
+        return ConstantVector::get(indices);
+    }
+
+ //////////////////////////////////////////////////////////////////////////
+ // @brief convert scalar bitmask to <4xfloat> bitmask
+    Value* ToMask(uint32_t bitmask)
+    {
+        std::vector<Constant*> lanes;
+        for (uint32_t lane = 0; lane < 4; ++lane)
+        {
+            // a set bit yields -1.0f for that lane, a clear bit yields 0.0f
+            const bool laneOn = (bitmask & (1 << lane)) != 0;
+            lanes.push_back(laneOn ? C(-1.0f) : C(0.0f));
+        }
+        return ConstantVector::get(lanes);
+    }
+
+ //////////////////////////////////////////////////////////////////////////
+ // @brief processes a single decl from the streamout stream. Reads 4 components from the input
+ // stream and writes N components to the output buffer given the componentMask or if
+ // a hole, just increments the buffer pointer
+ // @param pStream - pointer to current attribute
+ // @param pOutBuffers - pointers to the current location of each output buffer
+ // @param decl - input decl
+    void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
+    {
+        // @todo add this to x86 macros
+        // (declaration is requested for every decl, holes included)
+        Function* pfnMaskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
+
+        // one output element per bit set in the component mask
+        const uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
+
+        // current write position of the buffer this decl targets
+        Value*& pDst = pOutBuffers[decl.bufferIndex];
+
+        if (!decl.hole)
+        {
+            // after packing, the enabled components sit in the low lanes
+            const uint32_t packedMask = (1 << numComponents) - 1;
+
+            // step to the attribute slot and view it as one <4 x float>
+            Type* vec4Ty = VectorType::get(IRB()->getFloatTy(), 4);
+            Value* pSlot = GEP(pStream, C(4 * decl.attribSlot));
+            pSlot = BITCAST(pSlot, PointerType::get(vec4Ty, 0));
+            Value* vSlot = LOAD(pSlot);
+
+            // move the enabled components to the front
+            Value* vShuffleIdx = PackMask(decl.componentMask);
+            Value* vPacked = VSHUFFLE(vSlot, vSlot, vShuffleIdx);
+
+            // maskstore wants an i8* destination and a <4 x float> source
+            Value* pDstBytes = BITCAST(pDst, PointerType::get(mInt8Ty, 0));
+            Value* vSrc = BITCAST(vPacked, vec4Ty);
+            Value* vStoreMask = ToMask(packedMask);
+            CALL(pfnMaskStore, {pDstBytes, vStoreMask, vSrc});
+        }
+
+        // holes skip the store but still consume their space in the buffer
+        pDst = GEP(pDst, C(numComponents));
+    }
+
+ //////////////////////////////////////////////////////////////////////////
+ // @brief builds a single vertex worth of data for the given stream
+ // @param streamState - state for this stream
+ // @param pCurVertex - pointer to src stream vertex data
+ // @param pOutBuffer - pointers to up to 4 SO buffers
+    void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
+    {
+        // emit code for each of the stream's decls, in declaration order
+        const STREAMOUT_DECL* pDecl = streamState.decl;
+        const STREAMOUT_DECL* const pDeclEnd = pDecl + streamState.numDecls;
+        for (; pDecl != pDeclEnd; ++pDecl)
+        {
+            buildDecl(pCurVertex, pOutBuffer, *pDecl);
+        }
+    }
+
+ void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) // emits the body that streams out one primitive; leaves the insert point in the "valid" block
+ {
+ // get list of active SO buffers (every buffer index referenced by a decl)
+ std::unordered_set<uint32_t> activeSOBuffers;
+ for (uint32_t d = 0; d < streamState.numDecls; ++d)
+ {
+ const STREAMOUT_DECL& decl = streamState.decl[d];
+ activeSOBuffers.insert(decl.bufferIndex);
+ }
+
+ // always increment numPrimStorageNeeded (done before the OOB check below)
+ Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
+ numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
+ STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
+
+ // check OOB on active SO buffers. If any buffer is out of bound, don't write
+ // the primitive to any buffer
+ Value* oobMask = C(false);
+ for (uint32_t buffer : activeSOBuffers)
+ {
+ oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
+ }
+
+ BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
+
+ // early out if OOB
+ COND_BR(oobMask, returnBB, validBB);
+
+ IRB()->SetInsertPoint(validBB);
+
+ // numPrimsWritten only counts primitives that passed the OOB check
+ Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
+ numPrimsWritten = ADD(numPrimsWritten, C(1));
+ STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
+
+ // compute start pointer for each output buffer
+ Value* pOutBuffer[4]; // indexed by SO buffer index; only entries of active buffers are assigned
+ Value* pOutBufferStartVertex[4]; // where the current vertex starts in each active buffer
+ Value* outBufferPitch[4]; // pitch loaded from each active buffer's state
+ for (uint32_t b: activeSOBuffers)
+ {
+ Value* pBuf = getSOBuffer(pSoCtx, b);
+ Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+ pOutBuffer[b] = GEP(pData, streamOffset);
+ pOutBufferStartVertex[b] = pOutBuffer[b];
+
+ outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+ }
+
+ // loop over the vertices of the prim
+ Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
+ for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
+ {
+ buildVertex(streamState, pStreamData, pOutBuffer);
+
+ // increment stream and output buffer pointers
+ // stream verts are always KNOB_NUM_ATTRIBUTES * 4 elements apart
+ pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
+
+ // output buffers offset using pitch in buffer state
+ for (uint32_t b : activeSOBuffers)
+ {
+ pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
+ pOutBuffer[b] = pOutBufferStartVertex[b]; // restart from the vertex start, discarding the per-decl increments
+ }
+ }
+
+ // update each active buffer's streamOffset
+ for (uint32_t b : activeSOBuffers)
+ {
+ Value* pBuf = getSOBuffer(pSoCtx, b);
+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+ streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
+ STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+ }
+ }
+
+ Function* Create(const STREAMOUT_COMPILE_STATE& state) // builds, optimizes and returns the LLVM function for the given streamout state
+ {
+ static std::size_t soNum = 0; // running counter that makes each generated function name unique
+
+ std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); // ios_base::ate: writes are appended after "SOShader"
+ fnName << soNum++;
+
+ // SO function signature
+ // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
+
+ std::vector<Type*> args{
+ PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
+ };
+
+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
+ Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+
+ // create entry and return basic blocks
+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
+ BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
+
+ IRB()->SetInsertPoint(entry);
+
+ // arguments
+ auto argitr = soFunc->getArgumentList().begin();
+ Value* pSoCtx = &*argitr++;
+ pSoCtx->setName("pSoCtx");
+
+ const STREAMOUT_STREAM& streamState = state.stream;
+ buildStream(state, streamState, pSoCtx, returnBB, soFunc);
+
+ BR(returnBB); // buildStream leaves the insert point in its "valid" block; join the common return block from there
+
+ IRB()->SetInsertPoint(returnBB);
+ RET_VOID();
+
+ JitManager::DumpToFile(soFunc, "SoFunc");
+
+ FunctionPassManager passes(JM()->mpCurrentModule); // function-level pass pipeline, run once on the new function below
+ passes.add(createBreakCriticalEdgesPass());
+ passes.add(createCFGSimplificationPass());
+ passes.add(createEarlyCSEPass());
+ passes.add(createPromoteMemoryToRegisterPass());
+ passes.add(createCFGSimplificationPass());
+ passes.add(createEarlyCSEPass());
+ passes.add(createInstructionCombiningPass());
+ passes.add(createInstructionSimplifierPass());
+ passes.add(createConstantPropagationPass());
+ passes.add(createSCCPPass());
+ passes.add(createAggressiveDCEPass());
+
+ passes.run(*soFunc);
+
+ JitManager::DumpToFile(soFunc, "SoFunc_optimized");
+
+ return soFunc;
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JITs from streamout shader IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - handle to the LLVM function IR
+/// @return PFN_SO_FUNC - pointer to SOS function
+PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
+{
+    // both handles are opaque pointers to JIT-side objects
+    const llvm::Function* pFunc = reinterpret_cast<const llvm::Function*>(hFunc);
+    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+
+    // ask the execution engine for the address of the function, looked up by name
+    PFN_SO_FUNC pfnStreamOut = reinterpret_cast<PFN_SO_FUNC>(pJitMgr->mpExec->getFunctionAddress(pFunc->getName().str()));
+
+    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    pJitMgr->mIsModuleFinalized = true;
+
+    return pfnStreamOut;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles streamout shader
+/// @param hJitMgr - JitManager handle
+/// @param state - SO state to build function from
+extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
+{
+    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+
+    // work on a private copy so the caller's state is not modified
+    STREAMOUT_COMPILE_STATE localState = state;
+
+    // rebase every decl's attribute slot by offsetAttribs
+    const uint32_t slotBias = localState.offsetAttribs;
+    if (slotBias != 0)
+    {
+        STREAMOUT_STREAM& stream = localState.stream;
+        for (uint32_t d = 0; d != stream.numDecls; ++d)
+        {
+            stream.decl[d].attribSlot -= slotBias;
+        }
+    }
+
+    // build the IR in a new module, then compile it to native code
+    pJitMgr->SetupNewModule();
+
+    StreamOutJit theJit(pJitMgr);
+    HANDLE hFunc = theJit.Create(localState);
+
+    return JitStreamoutFunc(hJitMgr, hFunc);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
new file mode 100644
index 00000000000..097f8ab44d9
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
@@ -0,0 +1,94 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.h
+*
+* @brief Definition of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_DECL - Stream decl
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_DECL
+{
+ // Index of the streamout buffer this decl maps to.
+ DWORD bufferIndex;
+
+ // attribute slot to stream out
+ uint32_t attribSlot;
+
+ // attribute component mask, one bit per component to write
+ uint32_t componentMask;
+
+ // indicates this decl is a hole (nothing is stored, the buffer position still advances)
+ bool hole;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_STREAM - Stream decls
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_STREAM
+{
+ // number of decls for this stream
+ uint32_t numDecls;
+
+ // decl storage; only the first numDecls entries are used
+ STREAMOUT_DECL decl[128];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for streamout jit
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_COMPILE_STATE
+{
+    // number of verts per primitive
+    uint32_t numVertsPerPrim;
+    uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
+
+    uint64_t streamMask;
+
+    // stream decls
+    STREAMOUT_STREAM stream;
+
+    // Equality over the fields that drive code generation: the vertex count,
+    // the attribute offset and the first numDecls decls. Decl entries past
+    // numDecls are ignored. streamMask is not compared (unchanged behavior).
+    bool operator==(const STREAMOUT_COMPILE_STATE &other) const
+    {
+        if (numVertsPerPrim != other.numVertsPerPrim) return false;
+
+        // offsetAttribs is subtracted from every attribSlot before the
+        // function is built, so states that differ only here must not
+        // compare equal.
+        if (offsetAttribs != other.offsetAttribs) return false;
+
+        if (stream.numDecls != other.stream.numDecls) return false;
+
+        for (uint32_t i = 0; i < stream.numDecls; ++i)
+        {
+            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
+            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
+            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
+            if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
new file mode 100644
index 00000000000..ad73cd840a7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -0,0 +1,287 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ClearTile.cpp
+*
+* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro
+* tile in the destination.
+*
+******************************************************************************/
+#include "common/os.h"
+#include "core/context.h"
+#include "common/formats.h"
+#include "memory/TilingFunctions.h"
+#include "memory/tilingtraits.h"
+#include "memory/Convert.h"
+
+// Signature shared by every clear routine stored in the tables below. The
+// arguments match StoreMacroTileClear::StoreClear: clear color, destination
+// surface, then the x and y coordinates of the macro tile.
+typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+
+//////////////////////////////////////////////////////////////////////////
+/// Clear Raster Tile Function Tables.
+/// Both tables are indexed by the destination surface format and are filled
+/// in by InitSimClearTilesTable(). Entries for formats that are not
+/// registered there stay NULL.
+//////////////////////////////////////////////////////////////////////////
+static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS];
+
+static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS];
+
+//////////////////////////////////////////////////////////////////////////
+/// StoreRasterTileClear
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct StoreRasterTileClear
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Fills one raster tile of the destination surface with a single
+    ///        pre-formatted pixel value. Writes are clipped to the surface
+    ///        width and height.
+    /// @param dstFormattedColor - Clear color already converted to the
+    ///        destination format; dstBytesPerPixel bytes are read from it.
+    /// @param dstBytesPerPixel - Size in bytes of one destination pixel.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to raster tile.
+    INLINE static void StoreClear(
+        const BYTE* dstFormattedColor,
+        UINT dstBytesPerPixel,
+        SWR_SURFACE_STATE* pDstSurface,
+        UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
+    {
+        // Nothing to write when the tile starts outside of the surface. Without
+        // this guard row 0 would be stored even when y lies past the last row,
+        // because only rows 1..N are checked against the height below.
+        if ((x >= pDstSurface->width) || (y >= pDstSurface->height))
+        {
+            return;
+        }
+
+        // Compute destination address for raster tile.
+        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+            (y * pDstSurface->pitch) + (x * dstBytesPerPixel);
+
+        // start of first row
+        BYTE* pDst = pDstTile;
+        UINT dstBytesPerRow = 0;
+
+        // For each raster tile pixel in row 0 (rx, 0)
+        for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx)
+        {
+            memcpy(pDst, dstFormattedColor, dstBytesPerPixel);
+
+            // Increment pointer to next pixel in row.
+            pDst += dstBytesPerPixel;
+            dstBytesPerRow += dstBytesPerPixel;
+        }
+
+        // start of second row
+        pDst = pDstTile + pDstSurface->pitch;
+
+        // For each remaining row in the rest of the raster tile, replicate the
+        // (already clipped) first row.
+        for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry)
+        {
+            // copy row
+            memcpy(pDst, pDstTile, dstBytesPerRow);
+
+            // Increment pointer to first pixel in next row.
+            pDst += pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct StoreMacroTileClear
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Converts the clear color to DstFormat once and stores it to
+    ///        every raster tile of one macro tile of the destination surface.
+    /// @param pColor - Pointer to color to write to pixels.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to macro tile
+    static void StoreClear(
+        const FLOAT *pColor,
+        SWR_SURFACE_STATE* pDstSurface,
+        UINT x, UINT y)
+    {
+        UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
+
+        // max bpp is 128, so 16 is all we need here for one pixel.
+        // Zero-filled: the conversion below only writes the format's numComps
+        // components, while dstBytesPerPixel bytes are copied to the surface,
+        // so any remaining bits must not come from uninitialized stack memory.
+        BYTE dstFormattedColor[16] = { 0 };
+
+        FLOAT srcColor[4];
+
+        // Reorder the clear color into the component order of the destination.
+        for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
+        {
+            srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
+        }
+
+        // Convert the pre-swizzled float color to one destination-format pixel.
+        ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
+
+        // Store each raster tile from the hot tile to the destination surface.
+        // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
+        // Intent is for this function to only handle full tiles.
+        for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+        {
+            for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+            {
+                StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row));
+            }
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clears one macro tile of a render surface to the given color by
+///        dispatching to the clear routine registered for the surface format.
+/// @param pDstSurface - Destination surface state
+/// @param renderTargetIndex - Attachment being cleared; selects the depth
+///        or the color function table.
+/// @param x, y - Coordinates passed through to the clear routine.
+/// @param pClearColor - Pointer to clear color
+void StoreHotTileClear(
+    SWR_SURFACE_STATE *pDstSurface,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    UINT x,
+    UINT y,
+    const float* pClearColor)
+{
+    SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet.
+
+    // Depth has its own table; every other attachment goes through the color table.
+    const PFN_STORE_TILES_CLEAR* pTable = (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
+        ? sStoreTilesClearDepthTable
+        : sStoreTilesClearColorTable;
+
+    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = pTable[pDstSurface->format];
+
+    SWR_ASSERT(pfnStoreTilesClear != NULL);
+
+    /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress.
+    if (pfnStoreTilesClear == NULL)
+    {
+        return;
+    }
+
+    // Store a macro tile.
+    pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_COLOR_TABLE - Helper macro for setting up the
+/// color clear table. Zeroes sStoreTilesClearColorTable, then registers the
+/// StoreMacroTileClear instantiation for each color format listed below.
+/// Formats that are not listed keep a NULL entry.
+#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \
+ memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \
+ \
+ sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
+ sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_DEPTH_TABLE - Helper macro for setting up the
+/// depth clear table. Zeroes sStoreTilesClearDepthTable, then registers the
+/// StoreMacroTileClear instantiation for the two depth formats listed below.
+/// Formats that are not listed keep a NULL entry.
+#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
+ memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
+ \
+ sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
+ sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for ClearTile. Resets and fills both the color and
+/// the depth clear function tables that StoreHotTileClear reads.
+void InitSimClearTilesTable()
+{
+ INIT_STORE_TILES_CLEAR_COLOR_TABLE();
+ INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
new file mode 100644
index 00000000000..0f9e0ad4bd8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -0,0 +1,698 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file Convert.h
+*
+* @brief Conversion utility functions
+*
+******************************************************************************/
+#pragma once
+
+#if defined(_WIN32)
+// disable "potential divide by 0"
+#pragma warning(disable: 4723)
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Expand a 16-bit float (1 sign, 5 exponent, 10 mantissa bits)
+///        held in the low bits of val to a 32-bit single precision float.
+/// @param val - 16-bit float
+/// @return The value as a 32-bit float; zeros, denormals, infinities and
+///         NaNs are all handled.
+/// @todo Maybe move this outside of this file into a header?
+static float ConvertSmallFloatTo32(UINT val)
+{
+    // Sign bit moved from bit 15 to bit 31.
+    const uint32_t signBits = ((uint32_t)(val & 0x8000)) << 16;
+
+    UINT result;
+    if ((val & 0x7fff) == 0)
+    {
+        // Signed zero.
+        result = signBits;
+    }
+    else if ((val & 0x7c00) == 0x7c00)
+    {
+        // All exponent bits set: infinity when the mantissa is empty, NaN otherwise.
+        const UINT special = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
+        result = special | signBits;
+    }
+    else
+    {
+        uint32_t mantissa = (val & 0x3ff) << 13;
+        uint32_t exponent = (val >> 10) & 0x1f;
+
+        if ((exponent == 0) && (mantissa != 0)) // Adjust exponent and mantissa for denormals
+        {
+            // Shift the mantissa up until its leading one reaches the implicit
+            // bit position, lowering the exponent once per extra shift.
+            for (mantissa <<= 1; mantissa < (0x400 << 13); mantissa <<= 1)
+            {
+                exponent--;
+            }
+            // Drop the now-implicit leading one.
+            mantissa &= (0x3ff << 13);
+        }
+
+        // Rebias from 15 to 127; the mask absorbs the wrap-around of the
+        // unsigned exponent after denormal adjustment.
+        const uint32_t expBits = ((exponent - 15 + 127) & 0xff) << 23;
+        result = signBits | expBits | mantissa;
+    }
+
+    return *(float*)&result;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 32-bit single precision float to an
+///        unsigned small float with 5 exponent bits and a variable
+///        number of mantissa bits.
+///        Negative inputs become 0, values too large to represent become
+///        the largest finite value, and NaN / INF keep their class.
+/// @tparam numMantissaBits - Mantissa width of the small float.
+/// @param val - 32-bit float
+/// @return Packed small float: (exponent << numMantissaBits) | mantissa.
+/// @todo Maybe move this outside of this file into a header?
+template<UINT numMantissaBits>
+static UINT Convert32ToSmallFloat(float val)
+{
+    uint32_t sign, exp, mant;
+
+    // Mask of the low float32 mantissa bits that are shifted off when
+    // narrowing to numMantissaBits bits.
+    const uint32_t roundMask = (1u << (23 - numMantissaBits)) - 1;
+
+    // Extract the sign, exponent, and mantissa
+    UINT uf = *(UINT*)&val;
+
+    sign = (uf & 0x80000000) >> 31;
+    exp = (uf & 0x7F800000) >> 23;
+    mant = uf & 0x007FFFFF;
+
+    // 10/11 bit floats are unsigned. Negative values are clamped to 0.
+    if (sign != 0)
+    {
+        exp = mant = 0;
+    }
+    // Check for out of range
+    else if ((exp == 0xFF) && (mant != 0)) // NaN
+    {
+        exp = 0x1F;
+        // Set the top mantissa bit. (1 << numMantissaBits) would fall into the
+        // exponent field and leave the mantissa 0, which is the INF encoding.
+        mant = 1u << (numMantissaBits - 1);
+    }
+    else if ((exp == 0xFF) && (mant == 0)) // INF
+    {
+        exp = 0x1F;
+        mant = 0;
+    }
+    else if (exp > (0x70 + 0x1E)) // Too big to represent
+    {
+        exp = 0x1Eu;
+        mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa.
+    }
+    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+    {
+        // Make the implicit one explicit, then shift right until the exponent
+        // reaches the smallest normal exponent.
+        mant |= 0x00800000;
+        for (; exp <= 0x70; mant >>= 1, exp++)
+            ;
+        exp = 0;
+        mant = mant >> (23 - numMantissaBits);
+    }
+    else if (exp < 0x66) // Too small to represent -> Zero
+    {
+        exp = 0;
+        mant = 0;
+    }
+    else
+    {
+        // Saves bits that will be shifted off for rounding
+        const uint32_t roundBits = mant & roundMask;
+        // convert exponent and mantissa to the small float format
+        exp = exp - 0x70u;
+        mant = mant >> (23 - numMantissaBits);
+
+        // Essentially RTZ, but round up if off by only 1 lsb, i.e. when every
+        // discarded bit is set.
+        if (roundBits == roundMask)
+        {
+            mant++;
+            // check for overflow
+            if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits)
+                exp++;
+            // make sure only the needed bits are used
+            mant &= (1 << numMantissaBits) - 1;
+        }
+    }
+
+    UINT tmpVal = (exp << numMantissaBits) | mant;
+    return tmpVal;
+}
+
+#if KNOB_ARCH == KNOB_ARCH_AVX
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 32-bit single precision float to a 16-bit
+///        float with 1 sign bit, 5 exponent bits and 10 mantissa bits.
+///        Values too large to represent become the largest finite value
+///        and NaN is returned with the sign bit set.
+/// @param val - 32-bit float
+/// @return The packed 16-bit float.
+/// @todo Maybe move this outside of this file into a header?
+static uint16_t Convert32To16Float(float val)
+{
+    // Split the float into its raw fields.
+    const uint32_t bits = *(uint32_t*)&val;
+
+    uint32_t signBit = (bits & 0x80000000) >> 31;
+    uint32_t expBits = (bits & 0x7F800000) >> 23;
+    uint32_t mantBits = bits & 0x007FFFFF;
+
+    if (std::isnan(val))
+    {
+        signBit = 1; // set the sign bit for NANs
+        expBits = 0x1F;
+        mantBits = 0x200;
+    }
+    else if (std::isinf(val))
+    {
+        expBits = 0x1f;
+        mantBits = 0x0;
+    }
+    else if (expBits > (0x70 + 0x1E))
+    {
+        // Too big to represent -> max representable value
+        expBits = 0x1E;
+        mantBits = 0x3FF;
+    }
+    else if (expBits < 0x66)
+    {
+        // Too small to represent -> Zero
+        expBits = 0;
+        mantBits = 0;
+    }
+    else if (expBits <= 0x70)
+    {
+        // Denormal result: make the implicit one explicit and shift it down
+        // until the exponent reaches the smallest normal exponent.
+        mantBits |= 0x00800000;
+        while (expBits <= 0x70)
+        {
+            mantBits >>= 1;
+            ++expBits;
+        }
+        expBits = 0;
+        mantBits >>= 13;
+    }
+    else
+    {
+        // Essentially RTZ, but round up if off by only 1 lsb: every one of the
+        // 13 discarded mantissa bits is set.
+        const bool roundUp = ((mantBits & 0x1FFFu) == 0x1FFFu);
+
+        // convert exponent and mantissa to 16 bit format
+        expBits -= 0x70;
+        mantBits >>= 13;
+
+        if (roundUp)
+        {
+            ++mantBits;
+            // mantissa overflow carries into the exponent
+            if ((mantBits & 0xC00u) != 0)
+            {
+                ++expBits;
+            }
+            // make sure only the needed bits are used
+            mantBits &= 0x3FF;
+        }
+    }
+
+    return (uint16_t)((signBit << 15) | (expBits << 10) | mantBits);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert one float pixel to DstFormat. Each of the format's
+///        numComps components is converted according to its SWR_TYPE and
+///        the results are packed into the destination pixel.
+/// @param pDstPixel - Pointer to destination pixel.
+/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
+template<SWR_FORMAT DstFormat>
+static void ConvertPixelFromFloat(
+    BYTE* pDstPixel,
+    const float srcPixel[4])
+{
+    // Typeless bits, one entry per component. Zero-initialized so that a
+    // component whose type is not handled below (the default case only
+    // asserts) is stored as 0 instead of uninitialized stack data.
+    UINT outColor[4] = { 0, 0, 0, 0 };
+
+    // Store component
+    for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
+    {
+        SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp);
+
+        float src = srcPixel[comp];
+
+        switch (type)
+        {
+        case SWR_TYPE_UNORM:
+        {
+            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
+            src = (src != src) ? 0.0f : src;
+
+            // Clamp [0, 1]
+            src = std::max(src, 0.0f);
+            src = std::min(src, 1.0f);
+
+            // SRGB (the fourth component is left linear)
+            if (FormatTraits<DstFormat>::isSRGB && comp != 3)
+            {
+                src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f);
+            }
+
+            // Float scale to integer scale.
+            UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;
+            src = (float)scale * src;
+            src = roundf(src);
+            outColor[comp] = (UINT)src; // Drop fractional part.
+            break;
+        }
+        case SWR_TYPE_SNORM:
+        {
+            SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB);
+
+            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
+            src = (src != src) ? 0.0f : src;
+
+            // Clamp [-1, 1]
+            src = std::max(src, -1.0f);
+            src = std::min(src, 1.0f);
+
+            // Float scale to integer scale.
+            UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
+            src = (float)scale * src;
+
+            // Round half away from zero.
+            src += (src >= 0) ? 0.5f : -0.5f;
+
+            INT out = (INT)src;
+
+            outColor[comp] = *(UINT*)&out;
+
+            break;
+        }
+        case SWR_TYPE_UINT:
+        {
+            ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float.
+            //       However, the number in the hot tile should be unsigned integer. So doing this
+            //       to preserve bits instead of doing a float -> integer conversion.
+            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
+            {
+                outColor[comp] = *(UINT*)&src;
+            }
+            else
+            {
+                outColor[comp] = *(UINT*)&src;
+                UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;  // 2^numBits - 1
+
+                // Clamp to the largest value the component can hold.
+                outColor[comp] = std::min(max, outColor[comp]);
+            }
+            break;
+        }
+        case SWR_TYPE_SINT:
+        {
+            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
+            {
+                outColor[comp] = *(UINT*)&src;
+            }
+            else
+            {
+                INT out = *(INT*)&src;  // Hot tile format is SINT?
+                INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
+                INT min = -1 - max;
+
+                ///@note The output is unsigned integer (bag of bits) and so performing
+                //       the clamping here based on range of output component. Also, manually adding
+                //       the sign bit in the appropriate spot. Maybe a better way?
+                out = std::max(out, min);
+                out = std::min(out, max);
+
+                outColor[comp] = *(UINT*)&out;
+            }
+            break;
+        }
+        case SWR_TYPE_FLOAT:
+        {
+            if (FormatTraits<DstFormat>::GetBPC(comp) == 16)
+            {
+                // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph
+                // @todo 16bit float instruction support is orthogonal to avx support.  need to
+                // add check for F16C support instead.
+#if KNOB_ARCH == KNOB_ARCH_AVX2
+                __m128 src128 = _mm_set1_ps(src);
+                __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC);
+                UINT value = _mm_extract_epi16(srci128, 0);
+#else
+                UINT value = Convert32To16Float(src);
+#endif
+
+                outColor[comp] = value;
+            }
+            else if (FormatTraits<DstFormat>::GetBPC(comp) == 11)
+            {
+                outColor[comp] = Convert32ToSmallFloat<6>(src);
+            }
+            else if (FormatTraits<DstFormat>::GetBPC(comp) == 10)
+            {
+                outColor[comp] = Convert32ToSmallFloat<5>(src);
+            }
+            else
+            {
+                // Any other width is stored as raw float bits.
+                outColor[comp] = *(UINT*)&src;
+            }
+
+            break;
+        }
+        default:
+            SWR_ASSERT(0);
+            break;
+        }
+    }
+
+    typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel;
+
+    // Pack the components; each case intentionally falls through so that a
+    // format with N components stores components N-1 down to 0.
+    switch (FormatTraits<DstFormat>::numComps)
+    {
+    case 4:
+        pPixel->a = outColor[3];
+        // fall through
+    case 3:
+        pPixel->b = outColor[2];
+        // fall through
+    case 2:
+        pPixel->g = outColor[1];
+        // fall through
+    case 1:
+        pPixel->r = outColor[0];
+        break;
+    default:
+        SWR_ASSERT(0);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert pixel in any format to float32. Destination components
+///        are first set to the format defaults; each component present in
+///        the source is then converted according to its SWR_TYPE and
+///        written to its swizzled destination slot.
+/// @param dstPixel - Receives the four destination components.
+/// @param pSrc - Pointer to source pixel
+template<SWR_FORMAT SrcFormat>
+INLINE static void ConvertPixelToFloat(
+    float dstPixel[4],
+    const BYTE* pSrc)
+{
+    UINT srcColor[4];  // typeless bits
+
+    // unpack src pixel
+    typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
+
+    // apply format defaults (stored as raw 32-bit patterns)
+    for (uint32_t comp = 0; comp < 4; ++comp)
+    {
+        uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp);
+        dstPixel[comp] = *(float*)&def;
+    }
+
+    // load format data; each case intentionally falls through so that a
+    // format with N components loads components N-1 down to 0.
+    switch (FormatTraits<SrcFormat>::numComps)
+    {
+    case 4:
+        srcColor[3] = pPixel->a;
+        // fall through
+    case 3:
+        srcColor[2] = pPixel->b;
+        // fall through
+    case 2:
+        srcColor[1] = pPixel->g;
+        // fall through
+    case 1:
+        srcColor[0] = pPixel->r;
+        break;
+    default:
+        SWR_ASSERT(0);
+    }
+
+    // Convert components
+    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+    {
+        SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
+
+        UINT src = srcColor[comp];
+
+        switch (type)
+        {
+        case SWR_TYPE_UNORM:
+        {
+            float dst;
+            if (FormatTraits<SrcFormat>::isSRGB && comp != 3)
+            {
+                // Table lookup yields the raw bits of the linear float value.
+                dst = *(float*)&srgb8Table[src];
+            }
+            else
+            {
+                // component sizes > 16 must use fp divide to maintain ulp requirements
+                if (FormatTraits<SrcFormat>::GetBPC(comp) > 16)
+                {
+                    dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1);
+                }
+                else
+                {
+                    const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1));
+                    dst = (float)src * scale;
+                }
+            }
+            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
+            break;
+        }
+        case SWR_TYPE_SNORM:
+        {
+            SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB);
+
+            // Sign-extend the raw bits according to the component width.
+            float dst;
+            switch (FormatTraits<SrcFormat>::GetBPC(comp))
+            {
+            case 8:
+                dst = (float)((int8_t)src);
+                break;
+            case 16:
+                dst = (float)((int16_t)src);
+                break;
+            case 32:
+                dst = (float)((int32_t)src);
+                break;
+            default:
+                assert(0 && "attempted to load from SNORM with unsupported bpc");
+                dst = 0.0f;
+                break;
+            }
+            dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1));
+
+            // The most negative encoding (e.g. 0x80 for 8 bits) scales to
+            // slightly below -1.0, so clamp it. This replaces a special case
+            // that compared the raw bits against 0x10, which turned the value
+            // 16 into -1.0 and never matched the actual minimum encoding.
+            dst = std::max(dst, -1.0f);
+
+            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
+            break;
+        }
+        case SWR_TYPE_UINT:
+        {
+            // Integer bits are passed through unchanged in the float slot.
+            UINT dst = (UINT)src;
+            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
+            break;
+        }
+        case SWR_TYPE_SINT:
+        {
+            // Sign-extend, then pass the integer bits through in the float slot.
+            int dst;
+            switch (FormatTraits<SrcFormat>::GetBPC(comp))
+            {
+            case 8:
+                dst = (int8_t)src;
+                break;
+            case 16:
+                dst = (int16_t)src;
+                break;
+            case 32:
+                dst = (int32_t)src;
+                break;
+            default:
+                assert(0 && "attempted to load from SINT with unsupported bpc");
+                dst = 0;
+                break;
+            }
+            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
+            break;
+        }
+        case SWR_TYPE_FLOAT:
+        {
+            float dst;
+            if (FormatTraits<SrcFormat>::GetBPC(comp) == 16)
+            {
+#if KNOB_ARCH == KNOB_ARCH_AVX2
+                // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps
+                // @todo 16bit float instruction support is orthogonal to avx support.  need to
+                // add check for F16C support instead.
+                __m128i src128 = _mm_set1_epi32(src);
+                __m128 res = _mm_cvtph_ps(src128);
+                _mm_store_ss(&dst, res);
+#else
+                dst = ConvertSmallFloatTo32(src);
+#endif
+            }
+            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11)
+            {
+                // Shift the 6-bit mantissa up to the 10-bit mantissa position.
+                dst = ConvertSmallFloatTo32(src << 4);
+            }
+            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10)
+            {
+                // Shift the 5-bit mantissa up to the 10-bit mantissa position.
+                dst = ConvertSmallFloatTo32(src << 5);
+            }
+            else
+            {
+                // Any other width is read as raw float bits.
+                dst = *(float*)&src;
+            }
+
+            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
+            break;
+        }
+        default:
+            SWR_ASSERT(0);
+            break;
+        }
+    }
+}
+
+// non-templated version of conversion functions
+INLINE static void ConvertPixelFromFloat(
+ SWR_FORMAT format,
+ uint8_t* pDst,
+ const float srcPixel[4])
+{
+ switch (format)
+ {
+ case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break;
+ case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break;
+ case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break;
+ case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break;
+ case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break;
+ case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break;
+ case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break;
+ case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break;
+ case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break;
+ case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break;
+ case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break;
+ case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break;
+ case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break;
+ case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break;
+ case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break;
+ case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break;
+ case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break;
+ case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break;
+ case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break;
+ case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break;
+ case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break;
+ case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break;
+ case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break;
+ case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break;
+ case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break;
+ case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break;
+ case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS_LD>(pDst, srcPixel); break;
+ case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break;
+ case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break;
+ case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break;
+ case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); break;
+ case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break;
+ case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break;
+ case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break;
+ case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break;
+ case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break;
+ case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break;
+ case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break;
+ case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break;
+ case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break;
+ case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break;
+ case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break;
+ case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break;
+ case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break;
+ case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break;
+ case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break;
+ case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break;
+ case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break;
+ case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break;
+ case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS_LD>(pDst, srcPixel); break;
+ case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break;
+ case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break;
+ case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break;
+ case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break;
+ case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break;
+ case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break;
+ case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break;
+ case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break;
+ case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break;
+ case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break;
+ case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break;
+ case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break;
+ case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break;
+ case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break;
+ case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break;
+ case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break;
+ case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break;
+ case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break;
+ case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break;
+ case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break;
+ case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break;
+ case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break;
+ case R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break;
+ case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break;
+ case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break;
+ case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break;
+ case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break;
+ case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break;
+ case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break;
+ case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break;
+ case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break;
+ case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break;
+ case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break;
+ case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break;
+ case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break;
+ case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break;
+ case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break;
+ case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break;
+ case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break;
+ case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break;
+ case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break;
+ case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break;
+ case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break;
+ case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break;
+ case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break;
+ case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break;
+ case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break;
+ case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break;
+ case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break;
+ case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break;
+ case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break;
+ case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break;
+ case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break;
+ case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break;
+ case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break;
+ case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break;
+ case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break;
+ case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break;
+ case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break;
+ case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break;
+ case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break;
+ case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break;
+ case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break;
+ case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break;
+ case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break;
+ case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break;
+ case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); break;
+ case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break;
+ case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break;
+ case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break;
+ case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break;
+ case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break;
+ case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break;
+ case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break;
+ case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break;
+ case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break;
+ case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break;
+ case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break;
+ case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break;
+ case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break;
+ case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break;
+ default:
+ break;
+ }
+}
+
+
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
new file mode 100644
index 00000000000..5d9c0045a8a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
@@ -0,0 +1,396 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file LoadTile.cpp
+*
+* @brief Functionality for Load
+*
+******************************************************************************/
+#include "common/os.h"
+#include "common/formats.h"
+#include "core/context.h"
+#include "core/rdtsc_core.h"
+#include "memory/TilingFunctions.h"
+#include "memory/tilingtraits.h"
+#include "memory/Convert.h"
+
+typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); // same signature as LoadMacroTile<...>::Load(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex)
+
+//////////////////////////////////////////////////////////////////////////
+/// Load tile function tables: one per attachment kind (color / depth) and tile mode, each holding NUM_SWR_FORMATS entries indexed by surface format.
+//////////////////////////////////////////////////////////////////////////
+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
+
+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS];
+
+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
+
+//////////////////////////////////////////////////////////////////////////
+/// LoadRasterTile - Converts the pixels of one raster tile of a source
+/// surface to float and writes them into a hot tile.
+//////////////////////////////////////////////////////////////////////////
+template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct LoadRasterTile
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Store one float color into the destination raster tile.
+    /// @param srcColor - Color to store, already converted to float.
+    /// @param x, y - Pixel coordinates within the raster tile.
+    /// @param pDst - Pointer to the start of the destination raster tile,
+    ///               which is addressed as an array of simd tiles.
+    INLINE static void SetSwizzledDstColor(
+        const float srcColor[4],
+        uint32_t x, uint32_t y,
+        uint8_t* pDst)
+    {
+        typedef SimdTile<DstFormat, SrcFormat> SimdT;
+
+        SimdT* pDstSimdTiles = (SimdT*)pDst;
+
+        // Compute which simd tile we're accessing within the raster tile.
+        // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
+        uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
+
+        SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
+
+        // Linear pixel offset inside the selected simd tile.
+        uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
+
+        pSimdTile->SetSwizzledColor(simdOffset, srcColor);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Size of a surface dimension at a given mip level.
+    /// @param dim - Dimension of the base level, in pixels.
+    /// @param lod - Mip level.
+    /// @return dim >> lod, clamped so that a non-empty dimension never
+    ///         shrinks below one pixel; an empty dimension stays empty.
+    INLINE static uint32_t MinifiedDim(uint32_t dim, uint32_t lod)
+    {
+        const uint32_t minified = dim >> lod;
+        return ((dim != 0) && (minified == 0)) ? 1 : minified;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Loads one raster tile from the src surface into a hot tile.
+    ///        Pixels that fall outside the surface at the selected LOD are
+    ///        skipped and leave the destination untouched.
+    /// @param pSrcSurface - Src surface state
+    /// @param pDst - Destination hot tile pointer
+    /// @param x, y - Pixel coordinates of the start of the raster tile.
+    /// @param sampleNum - Sample index passed to the surface address computation.
+    /// @param renderTargetArrayIndex - Offset added to the surface's arrayIndex
+    ///                                 when addressing the source.
+    INLINE static void Load(
+        SWR_SURFACE_STATE* pSrcSurface,
+        uint8_t* pDst,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
+    {
+        // A plain shift yields 0 once the LOD exceeds log2(dim), which would
+        // skip every pixel of the smallest mip levels; clamp to one pixel.
+        const uint32_t lodWidth = MinifiedDim(pSrcSurface->width, pSrcSurface->lod);
+        const uint32_t lodHeight = MinifiedDim(pSrcSurface->height, pSrcSurface->lod);
+
+        // For each raster tile pixel (rx, ry)
+        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
+        {
+            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
+            {
+                if (((x + rx) < lodWidth) &&
+                    ((y + ry) < lodHeight))
+                {
+                    uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex,
+                                                                           pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum,
+                                                                           pSrcSurface->lod, pSrcSurface);
+
+                    float srcColor[4];
+                    ConvertPixelToFloat<SrcFormat>(srcColor, pSrc);
+
+                    // store pixel to hottile
+                    SetSwizzledDstColor(srcColor, rx, ry, pDst);
+                }
+            }
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// LoadMacroTile - Fills a hot tile by loading every raster tile, and
+/// every sample of each raster tile, of one macro tile.
+//////////////////////////////////////////////////////////////////////////
+template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct LoadMacroTile
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Load a macrotile from the source surface into a hot tile.
+    /// @param pSrcSurface - Source surface state
+    /// @param pDstHotTile - Pointer to the destination hot tile; raster tiles
+    ///                      are written back to back, one per sample.
+    /// @param x, y - Coordinates to macro tile
+    /// @param renderTargetArrayIndex - Array index forwarded to LoadRasterTile
+    static void Load(
+        SWR_SURFACE_STATE* pSrcSurface,
+        uint8_t *pDstHotTile,
+        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
+    {
+        typedef LoadRasterTile<TTraits, SrcFormat, DstFormat> RasterTileLoader;
+
+        // Walk the macro tile one raster tile at a time, rows first.
+        for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+        {
+            for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+            {
+                uint32_t sample = 0;
+                while (sample < pSrcSurface->numSamples)
+                {
+                    RasterTileLoader::Load(pSrcSurface, pDstHotTile,
+                                           (x + tileX), (y + tileY), sample, renderTargetArrayIndex);
+
+                    // Advance past the raster tile that was just written.
+                    pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8);
+                    sample++;
+                }
+            }
+        }
+    }
+};
+
+/// Forwards to gBucketMgr.StartBucket(id) when KNOB_ENABLE_RDTSC is defined;
+/// otherwise does nothing.
+static void BUCKETS_START(UINT id)
+{
+#if !defined(KNOB_ENABLE_RDTSC)
+    (void)id;
+#else
+    gBucketMgr.StartBucket(id);
+#endif
+}
+
+/// Forwards to gBucketMgr.StopBucket(id) when KNOB_ENABLE_RDTSC is defined;
+/// otherwise does nothing.
+static void BUCKETS_STOP(UINT id)
+{
+#if !defined(KNOB_ENABLE_RDTSC)
+    (void)id;
+#else
+    gBucketMgr.StopBucket(id);
+#endif
+}
+
+// On-demand buckets for load tiles, used by LoadHotTile.
+static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1); // one bucket id per format; -1 means not registered yet
+static std::mutex sBucketMutex; // held while a missing bucket id is registered and stored in sBuckets
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a full hottile from a render surface
+/// @param pSrcSurface - Source surface state; nothing is loaded when its
+///                      type is SURFACE_NULL.
+/// @param dstFormat - Format for hot tile (not consulted by this function).
+/// @param renderTargetIndex - Attachment being loaded; selects the color,
+///                            depth or stencil load path.
+/// @param x, y - Coordinates passed to the selected load function.
+/// @param renderTargetArrayIndex - Requested array index; forced to 0 when
+///                                 it is not below the surface depth.
+/// @param pDstHotTile - Pointer to Hot Tile
+void LoadHotTile(
+    SWR_SURFACE_STATE *pSrcSurface,
+    SWR_FORMAT dstFormat,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
+    uint8_t *pDstHotTile)
+{
+    PFN_LOAD_TILES pfnLoadTiles = nullptr;
+
+    // don't need to load null surfaces
+    if (pSrcSurface->type == SURFACE_NULL)
+    {
+        return;
+    }
+
+    // force 0 if requested renderTargetArrayIndex is OOB
+    if (renderTargetArrayIndex >= pSrcSurface->depth)
+    {
+        renderTargetArrayIndex = 0;
+    }
+
+    if (renderTargetIndex < SWR_ATTACHMENT_DEPTH)
+    {
+        // Color attachments: look the loader up by tile mode and format.
+        switch (pSrcSurface->tileMode)
+        {
+        case SWR_TILE_NONE:
+            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format];
+            break;
+        case SWR_TILE_MODE_YMAJOR:
+            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
+            break;
+        case SWR_TILE_MODE_XMAJOR:
+            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format];
+            break;
+        case SWR_TILE_MODE_WMAJOR:
+            SWR_ASSERT(pSrcSurface->format == R8_UINT);
+            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
+            break;
+        default:
+            SWR_ASSERT(0, "Unsupported tiling mode");
+            break;
+        }
+    }
+    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
+    {
+        // Currently depth can map to linear and tile-y.
+        switch (pSrcSurface->tileMode)
+        {
+        case SWR_TILE_NONE:
+            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format];
+            break;
+        case SWR_TILE_MODE_YMAJOR:
+            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
+            break;
+        default:
+            SWR_ASSERT(0, "Unsupported tiling mode");
+            break;
+        }
+    }
+    else
+    {
+        // Stencil: only R8_UINT is expected, in linear or tile-w layout.
+        SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL);
+        SWR_ASSERT(pSrcSurface->format == R8_UINT);
+        switch (pSrcSurface->tileMode)
+        {
+        case SWR_TILE_NONE:
+            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load;
+            break;
+        case SWR_TILE_MODE_WMAJOR:
+            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
+            break;
+        default:
+            SWR_ASSERT(0, "Unsupported tiling mode");
+            break;
+        }
+    }
+
+    // No loader was found for this tile mode / format combination.
+    if (pfnLoadTiles == nullptr)
+    {
+        SWR_ASSERT(false, "Unsupported format for load tile");
+        return;
+    }
+
+    // Load a macro tile.
+#ifdef KNOB_ENABLE_RDTSC
+    // NOTE(review): this first check reads sBuckets without holding sBucketMutex.
+    if (sBuckets[pSrcSurface->format] == -1)
+    {
+        // Guard the sBuckets update (presumably this function can be reached from
+        // multiple threads). The guard releases the mutex on every exit path,
+        // including an exception thrown while registering the bucket.
+        std::lock_guard<std::mutex> bucketLock(sBucketMutex);
+        if (sBuckets[pSrcSurface->format] == -1)
+        {
+            const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format);
+            BUCKET_DESC desc{ info.name, "", false, 0xffffffff };
+            sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc);
+        }
+    }
+#endif
+
+    BUCKETS_START(sBuckets[pSrcSurface->format]);
+    pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex);
+    BUCKETS_STOP(sBuckets[pSrcSurface->format]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables.
+#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \
+ memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \
+ \
+ sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \
+ sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_LOAD_TILES_DEPTH_TABLE - Helper macro that zeroes sLoadTilesDepthTable_<tilemode> and then registers the R16_UNORM, R32_FLOAT and R24_UNORM_X8_TYPELESS loaders (each converting to R32_FLOAT).
+#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \
+ memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \
+ \
+ sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32_FLOAT>::Load; \
+ sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32_FLOAT>::Load; \
+ sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<tilemode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fills the per-tile-mode LoadTile function tables (color and depth) by expanding the INIT_LOAD_TILES_* macros.
+void InitSimLoadTilesTable()
+{
+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); // linear (untiled) color loaders
+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); // linear (untiled) depth loaders
+
+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR);
+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR);
+
+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); // NOTE: no depth table is initialized for SWR_TILE_MODE_XMAJOR in this function
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
new file mode 100644
index 00000000000..9ed1d0bd0ec
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
@@ -0,0 +1,1717 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file StoreTile.cpp
+*
+* @brief Functionality for Store.
+*
+******************************************************************************/
+#include "common/os.h"
+#include "common/formats.h"
+#include "core/context.h"
+#include "core/rdtsc_core.h"
+#include "core/format_conversion.h"
+
+#include "memory/TilingFunctions.h"
+#include "memory/tilingtraits.h"
+#include "memory/Convert.h"
+#include "core/multisample.h"
+
+#include <array>
+#include <sstream>
+
+typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); // function-pointer type held by the store tables below
+
+//////////////////////////////////////////////////////////////////////////
+/// Store tile function tables, indexed [tile mode][SWR format]; every entry starts out null because of the empty-brace initializer.
+//////////////////////////////////////////////////////////////////////////
+static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
+static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
+static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (primary template)
+/// @brief Stores a 4x2 (AVX) raster-tile of AOS pixels to the destination rows.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// @param ppDsts - Array of destination pointers. Each pointer is
+/// to a single row of at most 16B.
+/// @tparam PixelSize - Pixel size in bits (callers in this file pass FormatTraits<>::bpp).
+/// @tparam NumDests - Number of destination pointers. Each pair of pointers is for a 16-byte column of two rows.
+//////////////////////////////////////////////////////////////////////////
+template <size_t PixelSize, size_t NumDests>
+struct StorePixels
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; // deleted: only the explicit specializations below can be called
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (8-bit pixel specialization)
+/// @brief Stores a 4x2 (AVX) raster-tile of 8bpp pixels to two rows.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// (8 bytes, read here as four 16-bit pixel pairs).
+/// @param ppDsts - Two destination row pointers; 4 bytes are written
+/// through each of them.
+/// NumDests is fixed at 2 for this specialization.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct StorePixels<8, 2>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
+ {
+ // Each 4-pixel row is 4 bytes, so move the data as 16-bit (2-pixel) units.
+ const uint16_t* pPixSrc = (const uint16_t*)pSrc;
+
+ // Unswizzle from SWR-Z order: units 0 and 2 form the first row
+ uint16_t* pRow = (uint16_t*)ppDsts[0];
+ pRow[0] = pPixSrc[0];
+ pRow[1] = pPixSrc[2];
+
+ pRow = (uint16_t*)ppDsts[1]; // units 1 and 3 form the second row
+ pRow[0] = pPixSrc[1];
+ pRow[1] = pPixSrc[3];
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (16-bit pixel specialization)
+/// @brief Stores a 4x2 (AVX) raster-tile of 16bpp pixels to two rows.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// (16 bytes, read here as four 32-bit pixel pairs).
+/// @param ppDsts - Two destination row pointers; 8 bytes are written
+/// through each of them.
+/// NumDests is fixed at 2 for this specialization.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct StorePixels<16, 2>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
+ {
+ // Each 4-pixel row is 8 bytes, so move the data as 32-bit (2-pixel) units.
+ const uint32_t* pPixSrc = (const uint32_t*)pSrc;
+
+ // Unswizzle from SWR-Z order: units 0 and 2 form the first row
+ uint32_t* pRow = (uint32_t*)ppDsts[0];
+ pRow[0] = pPixSrc[0];
+ pRow[1] = pPixSrc[2];
+
+ pRow = (uint32_t*)ppDsts[1]; // units 1 and 3 form the second row
+ pRow[0] = pPixSrc[1];
+ pRow[1] = pPixSrc[3];
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (32-bit pixel specialization)
+/// @brief Stores a 4x2 (AVX) raster-tile of 32bpp pixels to two rows.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// (32 bytes, fetched with aligned 128-bit loads).
+/// @param ppDsts - Two destination row pointers; 16 bytes are written
+/// through each of them with unaligned stores.
+/// NumDests is fixed at 2 for this specialization.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct StorePixels<32, 2>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
+ {
+ // Each 4-pixel row is 16-bytes
+ __m128i *pZRow01 = (__m128i*)pSrc;
+ __m128i vQuad00 = _mm_load_si128(pZRow01); // aligned load: first 16 bytes of the tile
+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); // aligned load: second 16 bytes of the tile
+
+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); // low 8 bytes of each load -> first row
+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); // high 8 bytes of each load -> second row
+
+ _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
+ _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (64-bit pixel specialization)
+/// @brief Stores a 4x2 (AVX) raster-tile of 64bpp pixels as four 16-byte pieces.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// (64 bytes, read as four __m128i values).
+/// @param ppDsts - Four destination pointers, already ordered to match
+/// the source; 16 bytes are written through each of them.
+/// NOTE(review): the plain __m128i accesses presumably need 16-byte-aligned pointers - confirm.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct StorePixels<64, 4>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
+ {
+ // Each 4-pixel row is 32 bytes.
+ const __m128i* pPixSrc = (const __m128i*)pSrc;
+
+ // order of pointers match SWR-Z layout, so piece i goes straight to destination i
+ __m128i** pvDsts = (__m128i**)&ppDsts[0];
+ *pvDsts[0] = pPixSrc[0];
+ *pvDsts[1] = pPixSrc[1];
+ *pvDsts[2] = pPixSrc[2];
+ *pvDsts[3] = pPixSrc[3];
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StorePixels (128-bit pixel specialization)
+/// @brief Stores a 4x2 (AVX) raster-tile of 128bpp pixels, one pixel per destination pointer.
+/// @param pSrc - Pointer to source raster tile in SWR-Z pixel order
+/// (128 bytes, read as eight __m128i pixels).
+/// @param ppDsts - Eight destination pointers; one 16-byte pixel is
+/// written through each of them.
+/// NOTE(review): the plain __m128i accesses presumably need 16-byte-aligned pointers - confirm.
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct StorePixels<128, 8>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
+ {
+ // Each 4-pixel row is 64 bytes.
+ const __m128i* pPixSrc = (const __m128i*)pSrc;
+
+ // Unswizzle from SWR-Z order: within each group of four, source pixels 1 and 2 swap places
+ __m128i** pvDsts = (__m128i**)&ppDsts[0];
+ *pvDsts[0] = pPixSrc[0];
+ *pvDsts[1] = pPixSrc[2];
+ *pvDsts[2] = pPixSrc[1];
+ *pvDsts[3] = pPixSrc[3];
+ *pvDsts[4] = pPixSrc[4];
+ *pvDsts[5] = pPixSrc[6];
+ *pvDsts[6] = pPixSrc[5];
+ *pvDsts[7] = pPixSrc[7];
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Generic format conversion plus SOA to AOS transpose for one SIMD of pixels (4x2 or 2x2)
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct ConvertPixelsSOAtoAOS
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Converts a SIMD from the Hot Tile to the destination format
+ /// and converts from SOA to AOS.
+ /// @param pSrc - Pointer to raster tile (SOA data in SrcFormat).
+ /// @param ppDsts - Destination row pointers (surface or deswizzling buffer), forwarded to StorePixels.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; // scratch: converted pixels, still SOA
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; // scratch: converted pixels after the transpose
+
+ // Convert from SrcFormat --> DstFormat
+ simdvector src;
+ LoadSOA<SrcFormat>(pSrc, src);
+ StoreSOA<DstFormat>(src, soaTile);
+
+ // Convert from SOA --> AOS
+ FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
+
+ // Store data into destination; the StorePixels specialization is picked by DstFormat's bpp and NumDests
+ StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
+/// Specialization for identical source and destination formats: only the SOA to AOS transpose is performed
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT Format>
+struct ConvertPixelsSOAtoAOS<Format, Format>
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Transposes a SIMD from the Hot Tile from SOA to AOS and
+ /// stores it; no format conversion is done.
+ /// @param pSrc - Pointer to raster tile (SOA data in Format).
+ /// @param ppDsts - Destination row pointers (surface or deswizzling buffer), forwarded to StorePixels.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; // scratch: pixels after the transpose
+
+ // Convert from SOA --> AOS, reading directly from the source tile
+ FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
+
+ // Store data into destination
+ StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Converts a SIMD from the Hot Tile to B5G6R5_UNORM by hand
+ /// (swizzle, clamp, normalize, bit-pack) and stores it as AOS.
+ /// @param pSrc - Pointer to raster tile (SOA R32G32B32A32_FLOAT data).
+ /// @param ppDsts - Destination row pointers (surface or deswizzling buffer), forwarded to StorePixels.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
+ static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Load hot-tile
+ simdvector src, dst;
+ LoadSOA<SrcFormat>(pSrc, src);
+
+ // deswizzle: pick the source component for each of the three destination components
+ dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
+ dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
+ dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
+
+ // clamp
+ dst.x = Clamp<DstFormat>(dst.x, 0);
+ dst.y = Clamp<DstFormat>(dst.y, 1);
+ dst.z = Clamp<DstFormat>(dst.z, 2);
+
+ // normalize
+ dst.x = Normalize<DstFormat>(dst.x, 0);
+ dst.y = Normalize<DstFormat>(dst.y, 1);
+ dst.z = Normalize<DstFormat>(dst.z, 2);
+
+ // pack: component 0 in the low bits, component 1 above it, component 2 above both (NOTE(review): presumably Normalize leaves integer bit patterns in the lanes, since they are only bit-cast here - confirm)
+ simdscalari packed = _simd_castps_si(dst.x);
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
+ FormatTraits<DstFormat>::GetBPC(1)));
+
+ // copy the low 16 bits of each 32 bit lane into consecutive 16-bit slots at the start of aosTile
+ uint32_t *pPacked = (uint32_t*)&packed;
+ uint16_t *pAosTile = (uint16_t*)&aosTile[0];
+ for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
+ {
+ *pAosTile++ = *pPacked++; // implicit truncation from 32 to 16 bits
+ }
+
+ // Store data into destination
+ StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Specialization for R32_FLOAT to R24_UNORM_X8_TYPELESS that preserves the destination's X8 bits
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
+{
+ static const SWR_FORMAT SrcFormat = R32_FLOAT;
+ static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Converts a SIMD from the Hot Tile to the destination format,
+ /// converts from SOA to AOS and merges it into the two destination rows.
+ /// @param pSrc - Pointer to raster tile (SOA R32_FLOAT data).
+ /// @param ppDsts - Destination row pointers; only ppDsts[0] and ppDsts[1] are read and written.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Convert from SrcFormat --> DstFormat
+ simdvector src;
+ LoadSOA<SrcFormat>(pSrc, src);
+ StoreSOA<DstFormat>(src, soaTile);
+
+ // Convert from SOA --> AOS
+ FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
+
+ // Store data into destination but don't overwrite the X8 bits
+ // Each 4-pixel row is 16-bytes
+ __m128i *pZRow01 = (__m128i*)aosTile;
+ __m128i vQuad00 = _mm_load_si128(pZRow01);
+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+
+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); // low 8 bytes of each load -> first row
+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); // high 8 bytes of each load -> second row
+
+ __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); // current contents of the first destination row
+ __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); // current contents of the second destination row
+
+ __m128i vMask = _mm_set1_epi32(0xFFFFFF); // low 24 bits of every 32-bit pixel
+
+ vDst0 = _mm_andnot_si128(vMask, vDst0); // keep only the existing top 8 bits
+ vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); // merge in the new low 24 bits
+ vDst1 = _mm_andnot_si128(vMask, vDst1);
+ vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+
+ _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
+ _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+ }
+};
+
+template<SWR_FORMAT DstFormat> // FlatConvert: packs one SIMD of float RGBA (SOA) into 8-bit-per-channel pixels and writes two 16-byte rows
+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar); // byte stride between component planes in pSrc
+
+ // apply DstFormat's component swizzle while we load (rgba -> bgra for the B8G8R8* formats)
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
+ simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
+
+ // clamp every component to [0.0f, 1.0f]
+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
+
+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
+
+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
+
+ vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
+ vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
+
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ // Gamma-correct only rgb; the fourth component is left linear
+ vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
+ vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
+ vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
+ }
+
+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+ vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
+
+ // moving to 8 wide integer vector types
+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+ __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
+
+#if KNOB_ARCH == KNOB_ARCH_AVX
+
+ // splitting into two sets of 4 wide integer vector types
+ // because AVX doesn't have instructions to support this operation at 8 wide
+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+ __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+
+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+ __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+
+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
+ srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
+ srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+ srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+
+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+ srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+
+ // unpack into rows that get the tiling order correct
+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+
+ __m256i final = _mm256_castsi128_si256(vRow00);
+ final = _mm256_insertf128_si256(final, vRow10, 1);
+
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+
+ // logic is as above, only wider (byte shifts act within each 128-bit half)
+ src1 = _mm256_slli_si256(src1, 1);
+ src2 = _mm256_slli_si256(src2, 2);
+ src3 = _mm256_slli_si256(src3, 3);
+
+ src0 = _mm256_or_si256(src0, src1);
+ src2 = _mm256_or_si256(src2, src3);
+
+ __m256i final = _mm256_or_si256(src0, src2);
+
+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
+ final = _mm256_permute4x64_epi64(final, 0xD8);
+
+#endif // NOTE: 'final' is declared only inside the two branches above
+
+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); // low 128 bits -> pDst, high 128 bits -> pDst1
+}
+
+template<SWR_FORMAT DstFormat> // FlatConvertNoAlpha: same packing as FlatConvert but only three components are loaded and written
+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar); // byte stride between component planes in pSrc
+
+ // apply DstFormat's component swizzle while we load (rgb -> bgr for the B8G8R8* formats)
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
+ // clamp every component to [0.0f, 1.0f]
+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
+
+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
+
+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
+
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ // Gamma-correct only rgb
+ vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
+ vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
+ vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
+ }
+
+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+
+ // moving to 8 wide integer vector types
+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+
+#if KNOB_ARCH == KNOB_ARCH_AVX
+
+ // splitting into two sets of 4 wide integer vector types
+ // because AVX doesn't have instructions to support this operation at 8 wide
+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+
+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+
+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+
+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
+
+ // unpack into rows that get the tiling order correct
+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+
+ __m256i final = _mm256_castsi128_si256(vRow00);
+ final = _mm256_insertf128_si256(final, vRow10, 1);
+
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+
+ // logic is as above, only wider (byte shifts act within each 128-bit half)
+ src1 = _mm256_slli_si256(src1, 1);
+ src2 = _mm256_slli_si256(src2, 2);
+
+ src0 = _mm256_or_si256(src0, src1);
+
+ __m256i final = _mm256_or_si256(src0, src2);
+
+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
+ final = _mm256_permute4x64_epi64(final, 0xD8);
+
+#endif // NOTE: 'final' is declared only inside the two branches above
+
+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); // low 128 bits -> pDst, high 128 bits -> pDst1; no fourth component is OR'd in, so all 32 bits per pixel are overwritten
+}
+
+template<> // specialization: R32G32B32A32_FLOAT -> B8G8R8A8_UNORM is delegated to FlatConvert
+struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> B8G8R8X8_UNORM is delegated to FlatConvertNoAlpha
+struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> B8G8R8A8_UNORM_SRGB is delegated to FlatConvert
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> B8G8R8X8_UNORM_SRGB is delegated to FlatConvertNoAlpha
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> R8G8B8A8_UNORM is delegated to FlatConvert
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> R8G8B8X8_UNORM is delegated to FlatConvertNoAlpha
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> R8G8B8A8_UNORM_SRGB is delegated to FlatConvert
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+template<> // specialization: R32G32B32A32_FLOAT -> R8G8B8X8_UNORM_SRGB is delegated to FlatConvertNoAlpha
+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
+{
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); // only the first two destination pointers are used
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StoreRasterTile - Generic per-pixel store of one raster tile with bounds checking
+//////////////////////////////////////////////////////////////////////////
+template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct StoreRasterTile
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Retrieve color from hot tile source which is always float.
+ /// @param pSrc - Pointer to raster tile (treated as an array of SimdTile).
+ /// @param x, y - Pixel coordinates within the raster tile.
+ /// @param outputColor - Receives the four color components from SimdTile::GetSwizzledColor.
+ INLINE static void GetSwizzledSrcColor(
+ uint8_t* pSrc,
+ uint32_t x, uint32_t y,
+ float outputColor[4])
+ {
+ typedef SimdTile<SrcFormat, DstFormat> SimdT;
+
+ SimdT* pSrcSimdTiles = (SimdT*)pSrc;
+
+ // Compute which simd tile we're accessing within the raster tile.
+ // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
+ uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
+
+ SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
+
+ uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); // row-major pixel index inside that simd tile
+
+ pSimdTile->GetSwizzledColor(simdOffset, outputColor);
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM raster tile to the destination surface one pixel at a time, skipping pixels outside the current LOD.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Pixel coordinates of the raster tile's origin; sampleNum and renderTargetArrayIndex are forwarded to ComputeSurfaceAddress.
+ INLINE static void Store(
+ uint8_t *pSrc,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
+ {
+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); // surface width at this LOD, never below 1
+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); // surface height at this LOD, never below 1
+
+ // For each raster tile pixel (rx, ry)
+ for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
+ {
+ for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
+ {
+ // Perform bounds checking.
+ if (((x + rx) < lodWidth) &&
+ ((y + ry) < lodHeight))
+ {
+ float srcColor[4];
+ GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
+
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false>((x + rx), (y + ry),
+ pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ sampleNum, pDstSurface->lod, pDstSurface); // NOTE(review): the same index is passed as both the 3rd and 4th argument - presumably z-slice and array index; confirm
+ ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
+ }
+ }
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - Primary template. Adds nothing of its own: it
+/// inherits the generic StoreRasterTile implementation for these traits.
+//////////////////////////////////////////////////////////////////////////
+template <typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
+{
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_NONE (linear) specialization for 8bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // Bytes each destination row pointer advances, and source bytes consumed, per Convert call.
+        const size_t dstBytesPerStep = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+        const size_t srcBytesPerStep = SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+
+        // Address of the first of the two destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            uint8_t* ppRows[] = { pDstRowPair, pDstRowPair + pDstSurface->pitch };
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+                ppRows[0] += dstBytesPerStep;
+                ppRows[1] += dstBytesPerStep;
+                pSrc += srcBytesPerStep;
+            }
+
+            // Move down two destination rows for the next pass.
+            pDstRowPair += 2 * pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_NONE (linear) specialization for 16bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // Bytes each destination row pointer advances, and source bytes consumed, per Convert call.
+        const size_t dstBytesPerStep = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+        const size_t srcBytesPerStep = SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+
+        // Address of the first of the two destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            uint8_t* ppRows[] = { pDstRowPair, pDstRowPair + pDstSurface->pitch };
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+                ppRows[0] += dstBytesPerStep;
+                ppRows[1] += dstBytesPerStep;
+                pSrc += srcBytesPerStep;
+            }
+
+            // Move down two destination rows for the next pass.
+            pDstRowPair += 2 * pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_NONE (linear) specialization for 32bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // Bytes each destination row pointer advances, and source bytes consumed, per Convert call.
+        const size_t dstBytesPerStep = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+        const size_t srcBytesPerStep = SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+
+        // Address of the first of the two destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            uint8_t* ppRows[] = { pDstRowPair, pDstRowPair + pDstSurface->pitch };
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+                ppRows[0] += dstBytesPerStep;
+                ppRows[1] += dstBytesPerStep;
+                pSrc += srcBytesPerStep;
+            }
+
+            // Move down two destination rows for the next pass.
+            pDstRowPair += 2 * pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_NONE (linear) specialization for 64bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t MAX_DST_COLUMN_BYTES = 16;
+    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // Address of the first of the two destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            // Four destinations per Convert call: two rows, each split into
+            // two chunks that start MAX_DST_COLUMN_BYTES apart.
+            uint8_t* ppDsts[] =
+            {
+                pDstRowPair,                                                // row 0, col 0
+                pDstRowPair + pDstSurface->pitch,                           // row 1, col 0
+                pDstRowPair + MAX_DST_COLUMN_BYTES,                         // row 0, col 1
+                pDstRowPair + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,    // row 1, col 1
+            };
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+                for (uint32_t i = 0; i < 4; ++i)
+                {
+                    ppDsts[i] += DST_COLUMN_BYTES_PER_SRC;
+                }
+                pSrc += SRC_COLUMN_BYTES;
+            }
+
+            // Move down two destination rows for the next pass.
+            pDstRowPair += 2 * pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_NONE (linear) specialization for 128bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t MAX_DST_COLUMN_BYTES = 16;
+    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // Address of the first of the two destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            // Need 8 pointers, 4 columns of 2 rows each; the row index varies fastest.
+            uint8_t* ppDsts[8];
+            for (uint32_t chunk = 0; chunk < 4; ++chunk)
+            {
+                for (uint32_t line = 0; line < 2; ++line)
+                {
+                    ppDsts[chunk * 2 + line] = pDstRowPair + line * pDstSurface->pitch + chunk * MAX_DST_COLUMN_BYTES;
+                }
+            }
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+                for (uint32_t i = 0; i < 8; ++i)
+                {
+                    ppDsts[i] += DST_COLUMN_BYTES_PER_SRC;
+                }
+                pSrc += SRC_COLUMN_BYTES;
+            }
+
+            // Move down two destination rows for the next pass.
+            pDstRowPair += 2 * pDstSurface->pitch;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        static const uint32_t DestRowWidthBytes = 16;                   // 16B rows
+
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
+        // The address of the tile's first row is computed once; each pass below steps down from it.
+        uint8_t* pRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        // Source bytes consumed per Convert call: a whole SIMD. 4x2 for AVX. 2x2 for SSE.
+        const uint32_t srcBytesPerSimd = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
+
+        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
+        {
+            uint8_t* ppDsts[] = { pRowPair, pRowPair + DestRowWidthBytes };
+
+            // First SIMD tile of this row pair.
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Second SIMD tile lands a quarter of a 16B row further along.
+            ppDsts[0] += DestRowWidthBytes / 4;
+            ppDsts[1] += DestRowWidthBytes / 4;
+
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Step down SIMD_TILE_Y_DIM destination rows.
+            pRowPair += SIMD_TILE_Y_DIM * DestRowWidthBytes;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        static const uint32_t DestRowWidthBytes = 16;                   // 16B rows
+
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
+        // The address of the tile's first row is computed once; each pass below steps down from it.
+        uint8_t* pRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        // Source bytes consumed per Convert call: a whole SIMD. 4x2 for AVX. 2x2 for SSE.
+        const uint32_t srcBytesPerSimd = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
+
+        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
+        {
+            uint8_t* ppDsts[] = { pRowPair, pRowPair + DestRowWidthBytes };
+
+            // First SIMD tile of this row pair.
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Second SIMD tile lands half of a 16B row further along.
+            ppDsts[0] += DestRowWidthBytes / 2;
+            ppDsts[1] += DestRowWidthBytes / 2;
+
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Step down SIMD_TILE_Y_DIM destination rows.
+            pRowPair += SIMD_TILE_Y_DIM * DestRowWidthBytes;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
+
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
+        // The address of the tile's first row is computed once; later rows are reached by stepping from it.
+        uint8_t* pRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
+        {
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
+            {
+                // Byte offset of pixel column `col` within a destination row.
+                const uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
+
+                uint8_t* ppDsts[] =
+                {
+                    pRowPair + xRowOffset,
+                    pRowPair + DestRowWidthBytes + xRowOffset,
+                };
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+                // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
+                pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
+            }
+
+            // Two destination rows were written; skip past both.
+            pRowPair += (DestRowWidthBytes * 2);
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
+        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
+
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
+        // The address of the tile's first row is computed once; each pass below steps down from it.
+        uint8_t* pRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        // Source bytes consumed per Convert call: a whole SIMD. 4x2 for AVX. 2x2 for SSE.
+        const uint32_t srcBytesPerSimd = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
+
+        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
+        {
+            uint8_t* ppDsts[] = { pRowPair, pRowPair + DestRowWidthBytes };
+
+            // First SIMD tile of this row pair.
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Second SIMD tile goes into the next 16B-wide column.
+            ppDsts[0] += DestColumnBytes;
+            ppDsts[1] += DestColumnBytes;
+
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Step down SIMD_TILE_Y_DIM destination rows.
+            pRowPair += SIMD_TILE_Y_DIM * DestRowWidthBytes;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat >
+{
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
+        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
+
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        const bool bPartialTile  = (x + KNOB_TILE_X_DIM > lodWidth) || (y + KNOB_TILE_Y_DIM > lodHeight);
+        if (bPartialTile)
+        {
+            GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+            return;
+        }
+
+        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
+        // The address of the tile's first row is computed once; each pass below steps down from it.
+        uint8_t* pRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
+        // Source bytes consumed per Convert call: a whole SIMD. 4x2 for AVX. 2x2 for SSE.
+        const uint32_t srcBytesPerSimd = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
+
+        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
+        {
+            // Two rows in each of two adjacent 16B-wide columns.
+            uint8_t* ppDsts[] =
+            {
+                pRowPair,
+                pRowPair + DestRowWidthBytes,
+                pRowPair + DestColumnBytes,
+                pRowPair + DestColumnBytes + DestRowWidthBytes,
+            };
+
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // The second SIMD tile occupies the following pair of columns.
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                ppDsts[i] += DestColumnBytes * 2;
+            }
+
+            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+            pSrc += srcBytesPerSimd;
+
+            // Step down SIMD_TILE_Y_DIM destination rows.
+            pRowPair += SIMD_TILE_Y_DIM * DestRowWidthBytes;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat >
+{
+    // The partial-tile fallback must be instantiated with the same tiling traits as
+    // this specialization (it previously named SWR_TILE_NONE, unlike every sibling).
+    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
+
+    static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
+    static const size_t TILE_Y_ROWS = 32;
+    static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
+
+    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+    static const size_t MAX_DST_COLUMN_BYTES = 16;
+
+    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+    static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores an 8x8 raster tile to the destination surface.
+    /// @param pSrc - Pointer to raster tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Pixel coordinates of the raster tile within the destination surface.
+    /// @param sampleNum - Sample index passed through to ComputeSurfaceAddress.
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex when computing the destination address.
+    INLINE static void Store(
+        uint8_t *pSrc,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+    {
+        // Tiles that do not lie fully inside the LOD take the bounds-checked generic path.
+        const uint32_t lodWidth  = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+        const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+        if (x + KNOB_TILE_X_DIM > lodWidth ||
+            y + KNOB_TILE_Y_DIM > lodHeight)
+        {
+            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+        }
+
+        // Address of the first of the two 16B destination rows written per outer iteration.
+        uint8_t* pDstRowPair = (uint8_t*)ComputeSurfaceAddress<false>(
+            x, y,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            pDstSurface->arrayIndex + renderTargetArrayIndex,
+            sampleNum, pDstSurface->lod, pDstSurface);
+
+        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+        {
+            // Need 8 pointers, 4 columns of 2 rows each; the row index varies fastest.
+            // Loop variables are named so they do not shadow the x/y parameters.
+            uint8_t* ppDsts[8];
+            for (uint32_t tileCol = 0; tileCol < 4; ++tileCol)
+            {
+                for (uint32_t line = 0; line < 2; ++line)
+                {
+                    ppDsts[tileCol * 2 + line] = pDstRowPair + line * TILE_Y_COL_WIDTH_BYTES + tileCol * TILE_Y_COL_BYTES;
+                }
+            }
+
+            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+            {
+                // Format conversion and convert from SOA to AOS, and store the rows.
+                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+                // The next SIMD tile starts four 16B-wide columns further on.
+                for (uint32_t i = 0; i < 8; ++i)
+                {
+                    ppDsts[i] += DST_COLUMN_BYTES_PER_SRC;
+                }
+                pSrc += SRC_COLUMN_BYTES;
+            }
+
+            // Move down two 16B rows within the columns for the next pass.
+            pDstRowPair += 2 * TILE_Y_COL_WIDTH_BYTES;
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// StoreMacroTile - Stores a macro tile which consists of raster tiles.
+//////////////////////////////////////////////////////////////////////////
+template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct StoreMacroTile
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores a macrotile to the destination surface using safe implementation.
+    /// @param pSrcHotTile - Pointer to macro tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to macro tile
+    /// @param renderTargetArrayIndex - Passed through to each raster-tile store.
+    static void StoreGeneric(
+        uint8_t *pSrcHotTile,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
+    {
+        // Source bytes occupied by one raster tile of one sample.
+        const size_t rasterTileBytes = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+
+        // The hot tile holds the raster tiles back to back, all samples of a tile together.
+        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+        {
+            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+            {
+                for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; ++sampleNum)
+                {
+                    StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store(
+                        pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
+                    pSrcHotTile += rasterTileBytes;
+                }
+            }
+        }
+    }
+
+    typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Stores a macrotile to the destination surface, choosing per sample
+    ///        between the generic and the optimized raster-tile store.
+    /// @param pSrcHotTile - Pointer to macro tile.
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to macro tile
+    /// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for addressing and passed to each store.
+    static void Store(
+        uint8_t *pSrcHotTile,
+        SWR_SURFACE_STATE* pDstSurface,
+        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
+    {
+        // One store routine per sample, indexed by sample number.
+        PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
+        for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; ++sampleNum)
+        {
+            // Address of pixel (0, 0) of this sample's LOD surface.
+            const size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false>(
+                0,
+                0,
+                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
+                pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
+                sampleNum,
+                pDstSurface->lod,
+                pDstSurface);
+
+            // Only the generic store-tile is supported when a non-linear lod surface does not
+            // start on a 4KB boundary (low 12 address bits set), or when samples are interleaved.
+            const bool bTiledAndUnaligned = (pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff));
+            const bool bForceGeneric      = bTiledAndUnaligned || (pDstSurface->bInterleavedSamples);
+
+            pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
+        }
+
+        // Source bytes occupied by one raster tile of one sample.
+        const size_t rasterTileBytes = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+
+        // Store each raster tile from the hot tile to the destination surface.
+        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+        {
+            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+            {
+                for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; ++sampleNum)
+                {
+                    pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
+                    pSrcHotTile += rasterTileBytes;
+                }
+            }
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Starts the given bucket on gBucketMgr. Compiles to an empty
+///        function unless KNOB_ENABLE_RDTSC is defined.
+/// @param id - bucket id handed to gBucketMgr.StartBucket
+static void BUCKETS_START(UINT id)
+{
+#if defined(KNOB_ENABLE_RDTSC)
+    gBucketMgr.StartBucket(id);
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Stops the given bucket on gBucketMgr. Compiles to an empty
+///        function unless KNOB_ENABLE_RDTSC is defined.
+/// @param id - bucket id handed to gBucketMgr.StopBucket
+static void BUCKETS_STOP(UINT id)
+{
+#if defined(KNOB_ENABLE_RDTSC)
+    gBucketMgr.StopBucket(id);
+#endif
+}
+
+// on demand buckets for store tiles: one bucket id slot per SWR_FORMAT, filled in lazily by StoreHotTile
+static std::mutex sBucketMutex; // held while a slot of sBuckets is being registered
+static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1); // -1 marks a format with no bucket registered yet
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Deswizzles and stores a full hottile to a render surface
+/// @param pDstSurface - Destination surface state; nothing is stored when its
+///                      type is SURFACE_NULL
+/// @param srcFormat - Format for hot tile (not read by this function; the store
+///                    function is selected from the destination format only)
+/// @param renderTargetIndex - Attachment being stored; selects the color, depth
+///                            or stencil store table
+/// @param x, y - Coordinates forwarded to the selected store function
+/// @param renderTargetArrayIndex - Destination array slice; forced to 0 when it
+///                                 is not below pDstSurface->depth
+/// @param pSrcHotTile - Pointer to Hot Tile
+void StoreHotTile(
+    SWR_SURFACE_STATE *pDstSurface,
+    SWR_FORMAT srcFormat,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
+    uint8_t *pSrcHotTile)
+{
+    if (pDstSurface->type == SURFACE_NULL)
+    {
+        return;
+    }
+
+    // force 0 if requested renderTargetArrayIndex is OOB
+    if (renderTargetArrayIndex >= pDstSurface->depth)
+    {
+        renderTargetArrayIndex = 0;
+    }
+
+    PFN_STORE_TILES pfnStoreTiles = nullptr;
+
+    // NOTE(review): a color attachment whose surface is W-major tiled fails the
+    // first test and ends up in the stencil table lookup below - confirm that
+    // this fall-through is intended.
+    if ((renderTargetIndex <= SWR_ATTACHMENT_COLOR7) && (pDstSurface->tileMode != SWR_TILE_MODE_WMAJOR))
+    {
+        pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format];
+    }
+    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
+    {
+        pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format];
+    }
+    else
+    {
+        pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format];
+    }
+
+    if(nullptr == pfnStoreTiles)
+    {
+        SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles");
+        return;
+    }
+
+    // Store a macro tile
+#ifdef KNOB_ENABLE_RDTSC
+    // NOTE(review): this first check reads sBuckets without holding the mutex
+    // while another thread may be writing the same slot under it.
+    if (sBuckets[pDstSurface->format] == -1)
+    {
+        // guard sBuckets update since storetiles is called by multiple threads;
+        // the scoped guard releases the mutex on every exit path, including an
+        // exception thrown while registering the bucket
+        std::lock_guard<std::mutex> bucketLock(sBucketMutex);
+        if (sBuckets[pDstSurface->format] == -1)
+        {
+            const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format);
+            BUCKET_DESC desc{info.name, "", false, 0xffffffff};
+            sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc);
+        }
+    }
+#endif
+
+    BUCKETS_START(sBuckets[pDstSurface->format]);
+    pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex);
+    BUCKETS_STOP(sBuckets[pDstSurface->format]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// InitStoreTilesTableColor - Fills row TileModeT of a color store-tiles table, indexed by destination format. Every entry reads an R32G32B32A32_FLOAT hot tile; the TilingTraits number is the destination bits per element.
+template <SWR_TILE_MODE TileModeT, size_t NumTileModesT, size_t ArraySizeT>
+void InitStoreTilesTableColor(
+ PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
+{
+ table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
+ table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
+ table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
+ table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
+ table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
+ table[TileModeT][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
+ table[TileModeT][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
+ table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
+ table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
+ table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
+ table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
+ table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
+ table[TileModeT][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
+ table[TileModeT][R32G32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
+ table[TileModeT][R32G32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
+ table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
+ table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
+ table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
+ table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
+
+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+ table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
+ table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
+ table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
+
+ table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
+ table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
+ table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
+ table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
+ table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
+ table[TileModeT][R16G16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
+ table[TileModeT][R16G16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
+ table[TileModeT][R16G16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
+ table[TileModeT][R16G16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
+ table[TileModeT][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
+
+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+ table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
+ table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
+ table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
+
+ table[TileModeT][R32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
+ table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
+ table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
+ table[TileModeT][A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
+ table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
+ table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
+ table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
+ table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
+
+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now (exception in this group: B5G6R5_UNORM is registered with Store, not StoreGeneric)
+ table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
+ table[TileModeT][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
+ table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
+ table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
+ table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
+ table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
+ table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
+
+ table[TileModeT][R8G8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
+ table[TileModeT][R8G8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
+ table[TileModeT][R8G8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
+ table[TileModeT][R8G8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
+ table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
+ table[TileModeT][R16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
+ table[TileModeT][R16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
+ table[TileModeT][R16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
+ table[TileModeT][R16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
+ table[TileModeT][A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
+ table[TileModeT][A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
+
+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+ table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
+ table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
+
+ table[TileModeT][R8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
+ table[TileModeT][R8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
+ table[TileModeT][R8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
+ table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
+ table[TileModeT][A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
+ table[TileModeT][BC1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM>::Store;
+ table[TileModeT][BC2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM>::Store;
+ table[TileModeT][BC3_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM>::Store;
+ table[TileModeT][BC4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_UNORM>::Store;
+ table[TileModeT][BC5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_UNORM>::Store;
+ table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store;
+ table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store;
+ table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store;
+ table[TileModeT][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
+ table[TileModeT][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
+ table[TileModeT][BC4_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_SNORM>::Store;
+ table[TileModeT][BC5_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_SNORM>::Store;
+ table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
+ table[TileModeT][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
+ table[TileModeT][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
+ table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
+ table[TileModeT][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
+ table[TileModeT][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
+
+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+ table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
+ table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
+ table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
+ table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
+ table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
+
+ table[TileModeT][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
+ table[TileModeT][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fills row TileModeT of a depth store-tiles table. Every entry
+///        reads an R32_FLOAT hot tile.
+/// @param table - store-tiles table indexed [tile mode][destination format]
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableDepth(
+    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
+{
+    typedef TilingTraits<TileModeT, 32> Traits32;
+    typedef TilingTraits<TileModeT, 16> Traits16;
+
+    // All writes land in the row belonging to this tile mode.
+    PFN_STORE_TILES (&modeRow)[ArraySizeT] = table[TileModeT];
+
+    modeRow[R32_FLOAT]             = StoreMacroTile<Traits32, R32_FLOAT, R32_FLOAT>::Store;
+    modeRow[R24_UNORM_X8_TYPELESS] = StoreMacroTile<Traits32, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
+    modeRow[R16_UNORM]             = StoreMacroTile<Traits16, R32_FLOAT, R16_UNORM>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fills row TileModeT of a stencil store-tiles table. Every entry
+///        reads an R8_UINT hot tile.
+/// @param table - store-tiles table indexed [tile mode][destination format]
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableStencil(
+    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
+{
+    typedef TilingTraits<TileModeT, 32> Traits32;
+    typedef TilingTraits<TileModeT, 8>  Traits8;
+
+    // All writes land in the row belonging to this tile mode.
+    PFN_STORE_TILES (&modeRow)[ArraySizeT] = table[TileModeT];
+
+    modeRow[R32_UINT] = StoreMacroTile<Traits32, R8_UINT, R32_UINT>::Store;
+    modeRow[R8_UINT]  = StoreMacroTile<Traits8, R8_UINT, R8_UINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for StoreTile
+///
+/// All three tables are cleared first so that every tile mode / format pair
+/// not registered below is nullptr, which StoreHotTile treats as an invalid
+/// combination.
+void InitSimStoreTilesTable()
+{
+    memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor));
+    memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth));
+    // the stencil table is cleared the same way as the other two
+    memset(sStoreTilesTableStencil, 0, sizeof(sStoreTilesTableStencil));
+
+    // linear surfaces: color, depth and stencil
+    InitStoreTilesTableColor<SWR_TILE_NONE>(sStoreTilesTableColor);
+    InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
+
+    // tiled color: Y-major and X-major
+    InitStoreTilesTableColor<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
+    InitStoreTilesTableColor<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
+
+    // tiled depth is Y-major only, tiled stencil is W-major only
+    InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
new file mode 100644
index 00000000000..a14f3bf3f7c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -0,0 +1,581 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file TilingFunctions.h
+*
+* @brief Tiling functions.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/state.h"
+#include "core/format_traits.h"
+#include "memory/tilingtraits.h"
+
+#include <algorithm>
+#include <cstring>
+
+#define MAX_NUM_LOD 15 // NOTE(review): presumably the maximum number of mip levels tracked per surface - confirm against the users of this constant
+
+#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Rounds x up to a multiple of a; the (a - 1) bitmask only works when a is a power of two, and both arguments are evaluated more than once. Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit.
+
+//////////////////////////////////////////////////////////////////////////
+/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
+///
+/// One simd-wide group of pixels stored as floats in SOA order. The storage
+/// is sized by HotTileFormat; the accessors iterate over the components of
+/// SrcOrDstFormat.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
+struct SimdTile
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color; component i is read from the stored
+    ///                      component FormatTraits<SrcOrDstFormat>::swizzle(i)
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // Lanes of a 4x2 simd tile are arranged as
+        //     0 1 4 5
+        //     2 3 6 7
+        // and 2x2 is the left half of that. The table turns a linear pixel
+        // index into the lane that holds it.
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t laneOfLinear[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t laneOfLinear[] = { 0, 1, 2, 3 };
+#endif
+        typedef FormatTraits<SrcOrDstFormat> Traits;
+        const uint32_t lane = laneOfLinear[index];
+
+        for (uint32_t comp = 0; comp < Traits::numComps; ++comp)
+        {
+            outputColor[comp] = this->color[Traits::swizzle(comp)][lane];
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Store color into simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color; component i is written to stored component i
+    ///              (no swizzle is applied on this path)
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // Same linear-index-to-lane table as in GetSwizzledColor:
+        //     0 1 4 5
+        //     2 3 6 7
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t laneOfLinear[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t laneOfLinear[] = { 0, 1, 2, 3 };
+#endif
+        typedef FormatTraits<SrcOrDstFormat> Traits;
+        const uint32_t lane = laneOfLinear[index];
+
+        // Only loop over the components needed for destination.
+        for (uint32_t comp = 0; comp < Traits::numComps; ++comp)
+        {
+            this->color[comp][lane] = src[comp];
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SimdTile specialization for R8_UINT hot tiles stored as R8_UINT.
+/// Components are kept as raw bytes; the float parameters of the accessors
+/// only carry bit patterns, no numeric int/float conversion takes place.
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct SimdTile <R8_UINT,R8_UINT>
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color; each component receives the stored
+    ///                      byte zero-extended to 32 bits, copied bit-for-bit
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //     0 1 4 5
+        //     2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+        static_assert(sizeof(float) == sizeof(uint32_t), "bit copy below assumes 32-bit float");
+
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            const uint32_t bits = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
+            // memcpy instead of *(float*)&bits: same resulting bits, but without
+            // reading a uint32_t object through a float lvalue.
+            std::memcpy(&outputColor[i], &bits, sizeof(bits));
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Store color into simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color; the first byte in memory of each component's
+    ///              representation is stored
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //     0 1 4 5
+        //     2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        // Only loop over the components needed for destination.
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            // copies exactly one byte, the same one *(uint8_t*)&src[i] reads
+            std::memcpy(&this->color[i][offset[index]], &src[i], sizeof(uint8_t));
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes lod offset for 1D surface at specified lod.
+/// @param info - format info; isBC, isSubsampled and bcWidth are read
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffset1D(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    // The base mip sits at the very start.
+    if (lod == 0)
+    {
+        offset = 0;
+        return;
+    }
+
+    // translate mip width from pixels to blocks for block compressed formats
+    // @note hAlign is already in blocks for compressed formats so no need to convert
+    uint32_t mipWidth = baseWidth;
+    if (info.isBC)
+    {
+        mipWidth /= info.bcWidth;
+    }
+
+    // Start with the aligned base mip, then add one aligned, halved width for
+    // each further mip that precedes the requested lod.
+    uint32_t total = GFX_ALIGN(mipWidth, hAlign);
+    for (uint32_t level = 1; level < lod; ++level)
+    {
+        mipWidth = GFX_ALIGN(std::max<uint32_t>(mipWidth >> 1, 1U), hAlign);
+        total += mipWidth;
+    }
+
+    if (info.isSubsampled)
+    {
+        total /= info.bcWidth;
+    }
+
+    offset = total;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes x lod offset for 2D surface at specified lod.
+///        Lods 0 and 1 get offset 0; every lod from 2 upward gets the same
+///        offset, the aligned width of mip 1.
+/// @param info - format info; isBC, isSubsampled and bcWidth are read
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffsetX(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod < 2)
+    {
+        offset = 0;
+        return;
+    }
+
+    // convert mip width from pixels to blocks for block compressed formats
+    // @note hAlign is already in blocks for compressed formats so no need to convert
+    uint32_t mipWidth = baseWidth;
+    if (info.isBC)
+    {
+        mipWidth /= info.bcWidth;
+    }
+
+    // width of mip 1, never below 1, then aligned
+    const uint32_t halfWidth = std::max<uint32_t>(mipWidth >> 1, 1U);
+    mipWidth = GFX_ALIGN(halfWidth, hAlign);
+
+    if (info.isSubsampled)
+    {
+        mipWidth /= info.bcWidth;
+    }
+
+    offset = mipWidth;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes y lod offset for 2D surface at specified lod.
+/// @param info - format info; isBC and bcHeight are read
+/// @param baseHeight - height of basemip (mip 0).
+/// @param vAlign - vertical alignment per mip, in rows
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffsetY(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseHeight,
+    uint32_t vAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    offset = 0;
+    if (lod == 0)
+    {
+        return;
+    }
+
+    // translate mip height from pixels to blocks for block compressed formats
+    // @note VAlign is already in blocks for compressed formats so no need to convert
+    uint32_t height = baseHeight;
+    if (info.isBC)
+    {
+        height /= info.bcHeight;
+    }
+
+    // On iteration `level`, height is that of mip (level - 1). The height of
+    // mip 1 (iteration 2) is not added, so lod 2 gets the same row offset as
+    // lod 1.
+    for (uint32_t level = 1; level <= lod; ++level)
+    {
+        if (level != 2)
+        {
+            offset += GFX_ALIGN(height, vAlign);
+        }
+        height = std::max<uint32_t>(height >> 1, 1U);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 1D surface offset
+/// @tparam UseCachedOffsets - when true the lod offset is read from
+///         pState->lodOffsets[0][lod] instead of being recomputed
+/// @param x - offset from start of array slice at given lod.
+/// @param array - array slice index
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output offset in bytes.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset1D(
+    uint32_t x,
+    uint32_t array,
+    uint32_t lod,
+    const SWR_SURFACE_STATE *pState,
+    uint32_t &xOffsetBytes)
+{
+    const SWR_FORMAT_INFO &fmtInfo = GetFormatInfo(pState->format);
+
+    uint32_t mipOffset = 0;
+    if (!UseCachedOffsets)
+    {
+        ComputeLODOffset1D(fmtInfo, pState->width, pState->halign, lod, mipOffset);
+    }
+    else
+    {
+        mipOffset = pState->lodOffsets[0][lod];
+    }
+
+    // slice start + mip start + x, scaled by fmtInfo.Bpp
+    const uint32_t sliceStart = array * pState->qpitch;
+    xOffsetBytes = (sliceStart + mipOffset + x) * fmtInfo.Bpp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Folds the sample number into the coordinates: into x/y for Y-major or W-major surfaces with interleaved samples, otherwise into arrayIndex for Y-major or linear surfaces; any other combination leaves all outputs untouched.
+/// @param pState - surface state (tileMode, bInterleavedSamples, numSamples and type are read)
+/// @param x, y, arrayIndex - in/out pixel coordinates and array slice index
+/// @param sampleNum - requested sample
+INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum)
+{
+ /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
+ if((pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+ pState->tileMode == SWR_TILE_MODE_WMAJOR) &&
+ pState->bInterleavedSamples)
+ {
+ uint32_t newX, newY, newSampleX, newSampleY; // NOTE(review): pdep_u32/pext_u32 below are presumably BMI2-style parallel bit deposit/extract - confirm
+ switch(pState->numSamples)
+ {
+ case 1:
+ newX = x;
+ newY = y;
+ newSampleX = newSampleY = 0;
+ break;
+ case 2:
+ {
+ assert(pState->type == SURFACE_2D);
+ static const uint32_t xMask = 0xFFFFFFFD; // every bit except bit 1
+ static const uint32_t sampleMaskX = 0x1;
+ newX = pdep_u32(x, xMask);
+ newY = y;
+ newSampleX = pext_u32(sampleNum, sampleMaskX);
+ newSampleY = 0;
+ }
+ break;
+ case 4:
+ {
+ assert(pState->type == SURFACE_2D);
+ static const uint32_t mask = 0xFFFFFFFD; // every bit except bit 1, used for both x and y
+ static const uint32_t sampleMaskX = 0x1;
+ static const uint32_t sampleMaskY = 0x2;
+ newX = pdep_u32(x, mask);
+ newY = pdep_u32(y, mask);
+ newSampleX = pext_u32(sampleNum, sampleMaskX);
+ newSampleY = pext_u32(sampleNum, sampleMaskY);
+ }
+ break;
+ case 8:
+ {
+ assert(pState->type == SURFACE_2D);
+ static const uint32_t xMask = 0xFFFFFFF9; // every bit except bits 1 and 2
+ static const uint32_t yMask = 0xFFFFFFFD; // every bit except bit 1
+ static const uint32_t sampleMaskX = 0x5;
+ static const uint32_t sampleMaskY = 0x2;
+ newX = pdep_u32(x, xMask);
+ newY = pdep_u32(y, yMask);
+ newSampleX = pext_u32(sampleNum, sampleMaskX);
+ newSampleY = pext_u32(sampleNum, sampleMaskY);
+ }
+ break;
+ case 16:
+ {
+ assert(pState->type == SURFACE_2D);
+ static const uint32_t mask = 0xFFFFFFF9; // every bit except bits 1 and 2, used for both x and y
+ static const uint32_t sampleMaskX = 0x5;
+ static const uint32_t sampleMaskY = 0xA;
+ newX = pdep_u32(x, mask);
+ newY = pdep_u32(y, mask);
+ newSampleX = pext_u32(sampleNum, sampleMaskX);
+ newSampleY = pext_u32(sampleNum, sampleMaskY);
+ }
+ break;
+ default:
+ assert(0 && "Unsupported sample count");
+ newX = newY = 0;
+ newSampleX = newSampleY = 0;
+ break;
+ }
+ x = newX | (newSampleX << 1); // sample bits are OR'd in starting at bit 1
+ y = newY | (newSampleY << 1); // sample bits are OR'd in starting at bit 1
+ }
+ else if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+ pState->tileMode == SWR_TILE_NONE)
+ {
+ uint32_t sampleShift; // log2 of the sample count
+ switch(pState->numSamples)
+ {
+ case 1:
+ assert(sampleNum == 0);
+ sampleShift = 0;
+ break;
+ case 2:
+ assert(pState->type == SURFACE_2D);
+ sampleShift = 1;
+ break;
+ case 4:
+ assert(pState->type == SURFACE_2D);
+ sampleShift = 2;
+ break;
+ case 8:
+ assert(pState->type == SURFACE_2D);
+ sampleShift = 3;
+ break;
+ case 16:
+ assert(pState->type == SURFACE_2D);
+ sampleShift = 4;
+ break;
+ default:
+ assert(0 && "Unsupported sample count");
+ sampleShift = 0;
+ break;
+ }
+ arrayIndex = (arrayIndex << sampleShift) | sampleNum; // each array slice expands to numSamples consecutive slices, one per sample
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 2D surface offset
+/// @tparam UseCachedOffsets - when true the lod offsets are read from
+///         pState->lodOffsets instead of being recomputed
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param array - array slice index
+/// @param sampleNum - sample index, folded into x/y or array by AdjustCoordsForMSAA
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
+{
+    const SWR_FORMAT_INFO &fmtInfo = GetFormatInfo(pState->format);
+
+    uint32_t mipOffsetX = 0;
+    uint32_t mipOffsetY = 0;
+    if (!UseCachedOffsets)
+    {
+        ComputeLODOffsetX(fmtInfo, pState->width, pState->halign, lod, mipOffsetX);
+        ComputeLODOffsetY(fmtInfo, pState->height, pState->valign, lod, mipOffsetY);
+    }
+    else
+    {
+        mipOffsetX = pState->lodOffsets[0][lod];
+        mipOffsetY = pState->lodOffsets[1][lod];
+    }
+
+    // x, y and array are by-value copies; the sample adjustment edits them in place.
+    AdjustCoordsForMSAA(pState, x, y, array, sampleNum);
+
+    const uint32_t texelX = x + mipOffsetX + pState->xOffset;
+    xOffsetBytes = texelX * fmtInfo.Bpp;
+
+    const uint32_t sliceStartRow = array * pState->qpitch;
+    yOffsetRows = sliceStartRow + mipOffsetY + y + pState->yOffset;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 3D surface offset
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param z - depth offset from start of array slice and lod.
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.
+/// @param zOffsetSlices - output y offset in slices.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
+{
+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+ uint32_t lodOffsetX, lodOffsetY;
+
+ if (UseCachedOffsets)
+ {
+ lodOffsetX = pState->lodOffsets[0][lod];
+ lodOffsetY = pState->lodOffsets[1][lod];
+ }
+ else
+ {
+ ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
+ ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
+ }
+
+ xOffsetBytes = (x + lodOffsetX) * info.Bpp;
+ yOffsetRows = lodOffsetY + y;
+ zOffsetSlices = z;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+/// and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+ return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+/// and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+ return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+/// and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+ switch (pState->tileMode)
+ {
+ case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState);
+ case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState);
+ case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState);
+ case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+ default: SWR_ASSERT(0, "Unsupported tiling mode");
+ }
+ return (uint32_t) NULL;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
+/// and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param zOffsetSlices - z offset from base of surface in slices
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+ switch (pState->tileMode)
+ {
+ case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+ case SWR_TILE_SWRZ: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+ default: SWR_ASSERT(0, "Unsupported tiling mode");
+ }
+ return (uint32_t) NULL;
+}
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Computes the offset from the surface base for the given location,
+ ///        dispatching on the surface type.
+ /// @param x - x location (element index for buffer surfaces)
+ /// @param y - y location in rows
+ /// @param z - z location for 3D surfaces
+ /// @param array - array slice for 1D and 2D surfaces
+ /// @param sampleNum - sample index (used by the 2D / cube paths)
+ /// @param lod - level of detail
+ /// @param pState - pointer to the surface state
+ /// @return offset from the surface base; 0 (after asserting) for an
+ ///         unsupported surface type
+ template<bool UseCachedOffsets>
+ INLINE
+ uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
+ {
+     uint32_t offsetX = 0, offsetY = 0, offsetZ = 0;
+     switch (pState->type)
+     {
+     case SURFACE_BUFFER:
+     case SURFACE_STRUCTURED_BUFFER:
+         // Buffers are addressed linearly: element index times pitch.
+         offsetX = x * pState->pitch;
+         return offsetX;
+     case SURFACE_1D:
+         ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX);
+         return TileSwizzle2D(offsetX, 0, pState);
+     case SURFACE_2D:
+     case SURFACE_CUBE:
+         // Cube surfaces take the same path as 2D surfaces.
+         ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
+         return TileSwizzle2D(offsetX, offsetY, pState);
+     case SURFACE_3D:
+         ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ);
+         return TileSwizzle3D(offsetX, offsetY, offsetZ, pState);
+     default: SWR_ASSERT(0, "Unsupported surface type");
+     }
+
+     // The result is an integer offset, not a pointer: return a plain 0 instead
+     // of casting NULL, which does not compile where NULL is nullptr.
+     return 0;
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes surface address at the given location and lod
+/// @param x - x location in pixels
+/// @param y - y location in rows
+/// @param z - z location for 3D surfaces
+/// @param array - array slice for 1D and 2D surfaces
+/// @param lod - level of detail
+/// @param pState - pointer to the surface state
+template<bool UseCachedOffsets>
+INLINE
+void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
+{
+ return pState->pBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
new file mode 100644
index 00000000000..50f8e57c22a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -0,0 +1,263 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilingtraits.h
+*
+* @brief Tiling traits.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/state.h"
+
+ // Primary tiling traits template. Tile mode / element size combinations
+ // without a specialization end up here, where every accessor asserts.
+ //   GetCu / GetCv / GetCr - shifts that turn x (bytes), y (rows) and z
+ //                           (slices) into tile indices
+ //   GetTileIDShift        - shift that turns a tile index into an offset
+ //   GetPdepX / GetPdepY   - bit masks used to deposit x / y bits inside a tile
+ template<SWR_TILE_MODE mode, int>
+ struct TilingTraits
+ {
+     static const SWR_TILE_MODE TileMode = mode;
+
+     static UINT GetCu() { SWR_ASSERT(0); return 0; }
+     static UINT GetCv() { SWR_ASSERT(0); return 0; }
+     static UINT GetCr() { SWR_ASSERT(0); return 0; }
+     static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; }
+
+     /// @todo correct pdep shifts for all rastertile dims. Unused for now
+     static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; }
+     static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; }
+ };
+
+ // Untiled (linear) surfaces: every shift and deposit mask is zero,
+ // whatever the second template argument is.
+ template<int X>
+ struct TilingTraits <SWR_TILE_NONE, X>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_NONE;
+
+     static UINT GetCu() { return 0; }
+     static UINT GetCv() { return 0; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return 0; }
+
+     static UINT GetPdepX() { return 0x00; }
+     static UINT GetPdepY() { return 0x00; }
+ };
+
+ // SWR_TILE_SWRZ, size argument 8 (presumably bits per element - confirm):
+ // tile shifts come straight from the KNOB_TILE_*_DIM_SHIFT knobs.
+ template<>
+ struct TilingTraits <SWR_TILE_SWRZ, 8>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_SWRZ;
+
+     static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; }
+     static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; }
+
+     /// @todo correct pdep shifts for all rastertile dims. Unused for now
+     static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; }
+     static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; }
+ };
+
+ // SWR_TILE_SWRZ, size argument 32 (presumably bits per element - confirm):
+ // the x shift and the tile ID shift grow by 2 compared with the 8 variant.
+ template<>
+ struct TilingTraits <SWR_TILE_SWRZ, 32>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_SWRZ;
+
+     static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; }
+     static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; }
+
+     // The two deposit masks are disjoint and together cover bits 0-7.
+     static UINT GetPdepX() { return 0x37; }
+     static UINT GetPdepY() { return 0xC8; }
+ };
+
+ // SWR_TILE_SWRZ, size argument 128 (presumably bits per element - confirm):
+ // the x shift and the tile ID shift grow by 4 compared with the 8 variant.
+ template<>
+ struct TilingTraits <SWR_TILE_SWRZ, 128>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_SWRZ;
+
+     static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; }
+     static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; }
+
+     /// @todo correct pdep shifts for all rastertile dims. Unused for now
+     static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; }
+     static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; }
+ };
+
+ // y-major tiling layout unaffected by element size.
+ // Shifts: x by 7, y by 5, tile ID by 12.
+ template<int X>
+ struct TilingTraits <SWR_TILE_MODE_YMAJOR, X>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_MODE_YMAJOR;
+
+     static UINT GetCu() { return 7; }
+     static UINT GetCv() { return 5; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return 12; }
+
+     // x bits land in bits 0-3 and 9-11, y bits in bits 4-8.
+     static UINT GetPdepX() { return 0xe0f; }
+     static UINT GetPdepY() { return 0x1f0; }
+ };
+
+ // x-major tiling layout unaffected by element size.
+ // Shifts: x by 9, y by 3, tile ID by 12.
+ template<int X>
+ struct TilingTraits <SWR_TILE_MODE_XMAJOR, X>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_MODE_XMAJOR;
+
+     static UINT GetCu() { return 9; }
+     static UINT GetCv() { return 3; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return 12; }
+
+     // x bits land in bits 0-8, y bits in bits 9-11.
+     static UINT GetPdepX() { return 0x1ff; }
+     static UINT GetPdepY() { return 0xe00; }
+ };
+
+ // w-major tiling layout, the same for every value of the second argument.
+ // Shifts: x by 6, y by 6, tile ID by 12.
+ template<int X>
+ struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
+ {
+     static const SWR_TILE_MODE TileMode = SWR_TILE_MODE_WMAJOR;
+
+     static UINT GetCu() { return 6; }
+     static UINT GetCv() { return 6; }
+     static UINT GetCr() { return 0; }
+     static UINT GetTileIDShift() { return 12; }
+
+     // The two masks are disjoint and together cover bits 0-11.
+     static UINT GetPdepX() { return 0xe15; }
+     static UINT GetPdepY() { return 0x1ea; }
+ };
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Parallel bit deposit: scatters the low-order bits of a to the
+ ///        positions of the set bits in mask, lowest mask bit first. Uses
+ ///        the _pdep_u32 intrinsic on AVX2 builds and a software loop
+ ///        otherwise.
+ /// @param a - source bits
+ /// @param mask - positions that receive the source bits
+ /// @return value with the deposited bits; bits outside mask are zero
+ INLINE
+ UINT pdep_u32(UINT a, UINT mask)
+ {
+ #if KNOB_ARCH==KNOB_ARCH_AVX2
+     return _pdep_u32(a, mask);
+ #else
+     UINT result = 0;
+
+     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+     // using bsf instead of funky loop
+     DWORD maskIndex;
+     while (_BitScanForward(&maskIndex, mask))
+     {
+         // 1. isolate lowest set bit of mask. The literal is unsigned because
+         //    maskIndex can be 31 and shifting a signed 1 that far overflows.
+         const UINT lowest = 1u << maskIndex;
+
+         // 2. populate LSB from src: all ones when bit 0 of a is set, else 0.
+         //    Unsigned negation avoids relying on signed right-shift behaviour.
+         const UINT LSB = 0u - (a & 1u);
+
+         // 3. copy bit from mask
+         result |= LSB & lowest;
+
+         // 4. clear lowest bit
+         mask &= ~lowest;
+
+         // 5. prepare for next iteration
+         a >>= 1;
+     }
+
+     return result;
+ #endif
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Parallel bit extract: gathers the bits of a selected by mask and
+ ///        packs them into the low-order bits of the result, lowest mask
+ ///        bit first. Uses the _pext_u32 intrinsic on AVX2 builds and a
+ ///        software loop otherwise.
+ /// @param a - source bits
+ /// @param mask - positions to extract from a
+ /// @return the extracted bits, packed towards bit 0
+ INLINE
+ UINT pext_u32(UINT a, UINT mask)
+ {
+ #if KNOB_ARCH==KNOB_ARCH_AVX2
+     return _pext_u32(a, mask);
+ #else
+     UINT result = 0;
+     DWORD maskIndex;
+     uint32_t currentBit = 0;
+     while (_BitScanForward(&maskIndex, mask))
+     {
+         // 1. isolate lowest set bit of mask. The literal is unsigned because
+         //    maskIndex can be 31 and shifting a signed 1 that far overflows.
+         const UINT lowest = 1u << maskIndex;
+
+         // 2. copy the selected bit of a to the next free result position,
+         //    again with an unsigned shift (currentBit can reach 31).
+         if (a & lowest)
+         {
+             result |= 1u << currentBit;
+         }
+         ++currentBit;
+
+         // 3. clear lowest bit
+         mask &= ~lowest;
+     }
+     return result;
+ #endif
+ }
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
+{
+ UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
+ return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 3D tiled surfaces
+/// @param qpitch - surface qpitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+/// @param tileZ - y offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
+{
+ UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
+ return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+template<typename TTraits>
+INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
+{
+ UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+ return (tileID | xSwizzle | ySwizzle);
+}
+
+ #if KNOB_ARCH <= KNOB_ARCH_AVX
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Computes the byte offset for 2D tiled surfaces. Specialization
+ ///        for tile-y surfaces that uses shifts and masks instead of the
+ ///        pdep emulation; only compiled when KNOB_ARCH <= KNOB_ARCH_AVX.
+ /// @param pitch - surface pitch in bytes
+ /// @param x - x offset in bytes
+ /// @param y - y offset in rows
+ template<>
+ INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
+ {
+     typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
+
+     const UINT tileX = x >> TTraits::GetCu();
+     const UINT tileY = y >> TTraits::GetCv();
+     const UINT tileOffset = ComputeTileOffset2D<TTraits>(pitch, tileX, tileY);
+
+     // Same placement as the 0xe0f / 0x1f0 deposit masks of the y-major
+     // traits: x bits 0-3 stay put, x bits 4-6 move to 9-11, y bits 0-4
+     // move to 4-8.
+     const UINT inTileX = ((x << 5) & 0xe00) | (x & 0xf);
+     const UINT inTileY = (y << 4) & 0x1f0;
+
+     return tileOffset | inTileX | inTileY;
+ }
+ #endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 3D tiled surfaces
+/// @param qpitch - depth pitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+/// @param z - y offset in slices
+template<typename TTraits>
+INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
+{
+ UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+ return (tileID | xSwizzle | ySwizzle);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
new file mode 100644
index 00000000000..44ab69815b1
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Python source
+from __future__ import print_function
+import os
+import sys
+import knob_defs
+from mako.template import Template
+from mako.exceptions import RichTraceback
+
+def write_template_to_string(template_filename, **kwargs):
+ try:
+ template = Template(filename=template_filename)
+ # Split + Join fixes line-endings for whatever platform you are using
+ return '\n'.join(template.render(**kwargs).splitlines())
+ except:
+ traceback = RichTraceback()
+ for (filename, lineno, function, line) in traceback.traceback:
+ print("File %s, line %s, in %s" % (filename, lineno, function))
+ print(line, "\n")
+ print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error))
+
+def write_template_to_file(template_filename, output_filename, **kwargs):
+ with open(output_filename, "w") as outfile:
+ print(write_template_to_string(template_filename, **kwargs), file=outfile)
+
+def main(args=sys.argv[1:]):
+ if len(args) != 1:
+ print('Usage:', sys.argv[0], '<output_directory>', file=sys.stderr)
+ return 1
+
+ output_dir = args[0]
+ if not os.path.isdir(output_dir):
+ if os.path.exists(output_dir):
+ print('ERROR: Invalid output directory:', output_dir, file=sys.stderr)
+ return 1
+
+ try:
+ os.makedirs(output_dir)
+ except:
+ print('ERROR: Could not create output directory:', output_dir, file=sys.stderr)
+ return 1
+
+ # Output path exists, now just run the template
+ template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template'])
+ output_file = os.sep.join([output_dir, 'gen_knobs.cpp'])
+ output_header = os.sep.join([output_dir, 'gen_knobs.h'])
+
+ for f in [output_header, output_file]:
+ write_template_to_file(template_file, f,
+ filename='gen_knobs',
+ knobs=knob_defs.KNOBS,
+ includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'],
+ gen_header=True if f == output_header else False)
+
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main())
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
new file mode 100644
index 00000000000..8c51e1e8e73
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -0,0 +1,226 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Python source
+ # Ordered table of knob definitions; gen_knobs.py passes it to the knobs
+ # template as `knobs`. Each entry is a [name, attributes] pair whose
+ # attributes dict holds:
+ #   'type'    - type of the knob, as a string
+ #   'default' - default value, as a string
+ #   'desc'    - list of description lines
+ KNOBS = [
+ ['ENABLE_ASSERT_DIALOGS', {
+ 'type' : 'bool',
+ 'default' : 'true',
+ 'desc' : ['Use dialogs when asserts fire.',
+ 'Asserts are only enabled in debug builds'],
+ }],
+
+ ['SINGLE_THREADED', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['If enabled will perform all rendering on the API thread.',
+ 'This is useful mainly for debugging purposes.'],
+ }],
+
+ ['DUMP_SHADER_IR', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+ }],
+
+ ['USE_GENERIC_STORETILE', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Always use generic function for performing StoreTile.',
+ 'Will be slightly slower than using optimized (jitted) path'],
+ }],
+
+ ['FAST_CLEAR', {
+ 'type' : 'bool',
+ 'default' : 'true',
+ 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
+ 'defer clear execution to first backend op on hottile, or hottile store'],
+ }],
+
+ ['MAX_NUMA_NODES', {
+ 'type' : 'uint32_t',
+ 'default' : '0',
+ 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
+ ' 0 == ALL NUMA-nodes in the system',
+ ' N == Use at most N NUMA-nodes for rendering'],
+ }],
+
+ ['MAX_CORES_PER_NUMA_NODE', {
+ 'type' : 'uint32_t',
+ 'default' : '0',
+ 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
+ ' 0 == ALL non-API thread cores per NUMA-node',
+ ' N == Use at most N cores per NUMA-node'],
+ }],
+
+ ['MAX_THREADS_PER_CORE', {
+ 'type' : 'uint32_t',
+ 'default' : '1',
+ 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
+ ' 0 == ALL hyper-threads per core',
+ ' N == Use at most N hyper-threads per physical core'],
+ }],
+
+ ['MAX_WORKER_THREADS', {
+ 'type' : 'uint32_t',
+ 'default' : '0',
+ 'desc' : ['Maximum worker threads to spawn.',
+ '',
+ 'IMPORTANT: If this is non-zero, no worker threads will be bound to',
+ 'specific HW threads. They will all be "floating" SW threads.',
+ 'In this case, the above 3 KNOBS will be ignored.'],
+ }],
+
+ ['BUCKETS_START_FRAME', {
+ 'type' : 'uint32_t',
+ 'default' : '1200',
+ 'desc' : ['Frame from when to start saving buckets data.',
+ '',
+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
+ 'for this to have an effect.'],
+ }],
+
+ ['BUCKETS_END_FRAME', {
+ 'type' : 'uint32_t',
+ 'default' : '1400',
+ 'desc' : ['Frame at which to stop saving buckets data.',
+ '',
+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
+ 'for this to have an effect.'],
+ }],
+
+ ['WORKER_SPIN_LOOP_COUNT', {
+ 'type' : 'uint32_t',
+ 'default' : '5000',
+ 'desc' : ['Number of spin-loop iterations worker threads will perform',
+ 'before going to sleep when waiting for work'],
+ }],
+
+ ['MAX_DRAWS_IN_FLIGHT', {
+ 'type' : 'uint32_t',
+ 'default' : '160',
+ 'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+ }],
+
+ ['MAX_PRIMS_PER_DRAW', {
+ 'type' : 'uint32_t',
+ 'default' : '2040',
+ 'desc' : ['Maximum primitives in a single Draw().',
+ 'Larger primitives are split into smaller Draw calls.',
+ 'Should be a multiple of (3 * vectorWidth).'],
+ }],
+
+ ['MAX_TESS_PRIMS_PER_DRAW', {
+ 'type' : 'uint32_t',
+ 'default' : '16',
+ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
+ 'Larger primitives are split into smaller Draw calls.',
+ 'Should be a multiple of (vectorWidth).'],
+ }],
+
+ ['MAX_FRAC_ODD_TESS_FACTOR', {
+ 'type' : 'float',
+ 'default' : '63.0f',
+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
+ }],
+
+ ['MAX_FRAC_EVEN_TESS_FACTOR', {
+ 'type' : 'float',
+ 'default' : '64.0f',
+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
+ }],
+
+ ['MAX_INTEGER_TESS_FACTOR', {
+ 'type' : 'uint32_t',
+ 'default' : '64',
+ 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+ }],
+
+
+ ['BUCKETS_ENABLE_THREADVIZ', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Enable threadviz output.'],
+ }],
+
+ ['TOSS_DRAW', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Disable per-draw/dispatch execution'],
+ }],
+
+ ['TOSS_QUEUE_FE', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at worker FE',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_FETCH', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at vertex fetch',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_IA', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at input assembler',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_VS', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at vertex shader',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_SETUP_TRIS', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at primitive setup',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_BIN_TRIS', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at primitive binning',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ['TOSS_RS', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Stop per-draw execution at rasterizer',
+ '',
+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ }],
+
+ ]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py
new file mode 100644
index 00000000000..d9638481889
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py
@@ -0,0 +1,8 @@
+# mako/__init__.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+
+ # Version string of this bundled copy of Mako.
+ __version__ = '1.0.1'
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py
new file mode 100644
index 00000000000..efbc4fc245d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py
@@ -0,0 +1,845 @@
+# mako/_ast_util.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""
+ ast
+ ~~~
+
+ The `ast` module helps Python applications to process trees of the Python
+ abstract syntax grammar. The abstract syntax itself might change with
+ each Python release; this module helps to find out programmatically what
+ the current grammar looks like and allows modifications of it.
+
+ An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as
+ a flag to the `compile()` builtin function or by using the `parse()`
+ function from this module. The result will be a tree of objects whose
+ classes all inherit from `ast.AST`.
+
+ A modified abstract syntax tree can be compiled into a Python code object
+ using the built-in `compile()` function.
+
+ Additionally various helper functions are provided that make working with
+ the trees simpler. The main intention of the helper functions and this
+ module in general is to provide an easy to use interface for libraries
+ that work tightly with the python syntax (template engines for example).
+
+
+ :copyright: Copyright 2008 by Armin Ronacher.
+ :license: Python License.
+"""
+from _ast import *
+from mako.compat import arg_stringname
+
+BOOLOP_SYMBOLS = {
+ And: 'and',
+ Or: 'or'
+}
+
+BINOP_SYMBOLS = {
+ Add: '+',
+ Sub: '-',
+ Mult: '*',
+ Div: '/',
+ FloorDiv: '//',
+ Mod: '%',
+ LShift: '<<',
+ RShift: '>>',
+ BitOr: '|',
+ BitAnd: '&',
+ BitXor: '^'
+}
+
+CMPOP_SYMBOLS = {
+ Eq: '==',
+ Gt: '>',
+ GtE: '>=',
+ In: 'in',
+ Is: 'is',
+ IsNot: 'is not',
+ Lt: '<',
+ LtE: '<=',
+ NotEq: '!=',
+ NotIn: 'not in'
+}
+
+UNARYOP_SYMBOLS = {
+ Invert: '~',
+ Not: 'not',
+ UAdd: '+',
+ USub: '-'
+}
+
+ALL_SYMBOLS = {}
+ALL_SYMBOLS.update(BOOLOP_SYMBOLS)
+ALL_SYMBOLS.update(BINOP_SYMBOLS)
+ALL_SYMBOLS.update(CMPOP_SYMBOLS)
+ALL_SYMBOLS.update(UNARYOP_SYMBOLS)
+
+
+def parse(expr, filename='<unknown>', mode='exec'):
+ """Parse an expression into an AST node."""
+ return compile(expr, filename, mode, PyCF_ONLY_AST)
+
+
+def to_source(node, indent_with=' ' * 4):
+ """
+ This function can convert a node tree back into python sourcecode. This
+ is useful for debugging purposes, especially if you're dealing with custom
+ asts not generated by python itself.
+
+ It could be that the sourcecode is evaluable when the AST itself is not
+ compilable / evaluable. The reason for this is that the AST contains some
+ more data than regular sourcecode does, which is dropped during
+ conversion.
+
+ Each level of indentation is replaced with `indent_with`. Per default this
+ parameter is equal to four spaces as suggested by PEP 8, but it might be
+ adjusted to match the application's styleguide.
+ """
+ generator = SourceGenerator(indent_with)
+ generator.visit(node)
+ return ''.join(generator.result)
+
+
+def dump(node):
+ """
+ A very verbose representation of the node passed. This is useful for
+ debugging purposes.
+ """
+ def _format(node):
+ if isinstance(node, AST):
+ return '%s(%s)' % (node.__class__.__name__,
+ ', '.join('%s=%s' % (a, _format(b))
+ for a, b in iter_fields(node)))
+ elif isinstance(node, list):
+ return '[%s]' % ', '.join(_format(x) for x in node)
+ return repr(node)
+ if not isinstance(node, AST):
+ raise TypeError('expected AST, got %r' % node.__class__.__name__)
+ return _format(node)
+
+
+def copy_location(new_node, old_node):
+ """
+ Copy the source location hint (`lineno` and `col_offset`) from the
+ old to the new node if possible and return the new one.
+ """
+ for attr in 'lineno', 'col_offset':
+ if attr in old_node._attributes and attr in new_node._attributes \
+ and hasattr(old_node, attr):
+ setattr(new_node, attr, getattr(old_node, attr))
+ return new_node
+
+
+def fix_missing_locations(node):
+ """
+ Some nodes require a line number and the column offset. Without that
+ information the compiler will abort the compilation. Because it can be
+ a dull task to add appropriate line numbers and column offsets when
+ adding new nodes this function can help. It copies the line number and
+ column offset of the parent node to the child nodes without this
+ information.
+
+ Unlike `copy_location` this works recursive and won't touch nodes that
+ already have a location information.
+ """
+ def _fix(node, lineno, col_offset):
+ if 'lineno' in node._attributes:
+ if not hasattr(node, 'lineno'):
+ node.lineno = lineno
+ else:
+ lineno = node.lineno
+ if 'col_offset' in node._attributes:
+ if not hasattr(node, 'col_offset'):
+ node.col_offset = col_offset
+ else:
+ col_offset = node.col_offset
+ for child in iter_child_nodes(node):
+ _fix(child, lineno, col_offset)
+ _fix(node, 1, 0)
+ return node
+
+
+def increment_lineno(node, n=1):
+    """
+    Increment the line numbers of all nodes by `n` if they have line number
+    attributes.  This is useful to "move code" to a different location in a
+    file.
+
+    :param node: root of the tree; it and all of its descendants are
+     updated in place.
+    :param n: amount added to every `lineno` (default 1).
+    """
+    # `walk` yields `node` itself first and then every descendant, so the
+    # root needs no separate handling.
+    for child in walk(node):
+        if 'lineno' in child._attributes:
+            child.lineno = getattr(child, 'lineno', 0) + n
+
+
+def iter_fields(node):
+ """Iterate over all fields of a node, only yielding existing fields."""
+ # CPython 2.5 compat
+ if not hasattr(node, '_fields') or not node._fields:
+ return
+ for field in node._fields:
+ try:
+ yield field, getattr(node, field)
+ except AttributeError:
+ pass
+
+
+def get_fields(node):
+ """Like `iter_fields` but returns a dict."""
+ return dict(iter_fields(node))
+
+
+def iter_child_nodes(node):
+ """Iterate over all direct child nodes of a node.
+
+ Yields every field value that is an `AST` instance, and every `AST`
+ instance found inside a field value that is a list.
+ """
+ for name, field in iter_fields(node):
+ if isinstance(field, AST):
+ yield field
+ elif isinstance(field, list):
+ for item in field:
+ if isinstance(item, AST):
+ yield item
+
+
+def get_child_nodes(node):
+ """Like `iter_child_nodes` but returns a list."""
+ return list(iter_child_nodes(node))
+
+
+def get_compile_mode(node):
+    """
+    Get the mode for `compile` of a given node.  If the node is not a `mod`
+    node (`Expression`, `Module` etc.) a `TypeError` is raised.
+
+    Returns ``'eval'`` for `Expression`, ``'single'`` for `Interactive` and
+    ``'exec'`` for every other `mod` node.
+    """
+    if not isinstance(node, mod):
+        raise TypeError('expected mod node, got %r' % node.__class__.__name__)
+    # `compile()` only understands 'exec', 'eval' and 'single'; 'exec' is
+    # the right mode for `Module` and the fallback for anything else.
+    return {
+        Expression: 'eval',
+        Interactive: 'single'
+    }.get(node.__class__, 'exec')
+
+
+def get_docstring(node):
+ """
+ Return the docstring for the given node or `None` if no docstring can be
+ found. If the node provided does not accept docstrings a `TypeError`
+ will be raised.
+ """
+ if not isinstance(node, (FunctionDef, ClassDef, Module)):
+ raise TypeError("%r can't have docstrings" % node.__class__.__name__)
+ if node.body and isinstance(node.body[0], Str):
+ return node.body[0].s
+
+
+def walk(node):
+ """
+ Iterate over all nodes. This is useful if you only want to modify nodes in
+ place and don't care about the context or the order the nodes are returned.
+ """
+ from collections import deque
+ todo = deque([node])
+ while todo:
+ node = todo.popleft()
+ todo.extend(iter_child_nodes(node))
+ yield node
+
+
+class NodeVisitor(object):
+ """
+ Walks the abstract syntax tree and call visitor functions for every node
+ found. The visitor functions may return values which will be forwarded
+ by the `visit` method.
+
+ Per default the visitor functions for the nodes are ``'visit_'`` +
+ class name of the node. So a `TryFinally` node visit function would
+ be `visit_TryFinally`. This behavior can be changed by overriding
+ the `get_visitor` function. If no visitor function exists for a node
+ (return value `None`) the `generic_visit` visitor is used instead.
+
+ Don't use the `NodeVisitor` if you want to apply changes to nodes during
+ traversing. For this a special visitor exists (`NodeTransformer`) that
+ allows modifications.
+ """
+
+ def get_visitor(self, node):
+ """
+ Return the visitor function for this node or `None` if no visitor
+ exists for this node. In that case the generic visit function is
+ used instead.
+ """
+ method = 'visit_' + node.__class__.__name__
+ return getattr(self, method, None)
+
+ def visit(self, node):
+ """Visit a node."""
+ f = self.get_visitor(node)
+ if f is not None:
+ return f(node)
+ return self.generic_visit(node)
+
+ def generic_visit(self, node):
+ """Called if no explicit visitor function exists for a node."""
+ for field, value in iter_fields(node):
+ if isinstance(value, list):
+ for item in value:
+ if isinstance(item, AST):
+ self.visit(item)
+ elif isinstance(value, AST):
+ self.visit(value)
+
+
+class NodeTransformer(NodeVisitor):
+ """
+ Walks the abstract syntax tree and allows modifications of nodes.
+
+ The `NodeTransformer` will walk the AST and use the return value of the
+ visitor functions to replace or remove the old node. If the return
+ value of the visitor function is `None` the node will be removed
+ from the previous location otherwise it's replaced with the return
+ value. The return value may be the original node in which case no
+ replacement takes place.
+
+ Here an example transformer that rewrites all `foo` to `data['foo']`::
+
+ class RewriteName(NodeTransformer):
+
+ def visit_Name(self, node):
+ return copy_location(Subscript(
+ value=Name(id='data', ctx=Load()),
+ slice=Index(value=Str(s=node.id)),
+ ctx=node.ctx
+ ), node)
+
+ Keep in mind that if the node you're operating on has child nodes
+ you must either transform the child nodes yourself or call the generic
+ visit function for the node first.
+
+ Nodes that were part of a collection of statements (that applies to
+ all statement nodes) may also return a list of nodes rather than just
+ a single node.
+
+ Usually you use the transformer like this::
+
+ node = YourTransformer().visit(node)
+ """
+
+ def generic_visit(self, node):
+ for field, old_value in iter_fields(node):
+ old_value = getattr(node, field, None)
+ if isinstance(old_value, list):
+ new_values = []
+ for value in old_value:
+ if isinstance(value, AST):
+ value = self.visit(value)
+ if value is None:
+ continue
+ elif not isinstance(value, AST):
+ new_values.extend(value)
+ continue
+ new_values.append(value)
+ old_value[:] = new_values
+ elif isinstance(old_value, AST):
+ new_node = self.visit(old_value)
+ if new_node is None:
+ delattr(node, field)
+ else:
+ setattr(node, field, new_node)
+ return node
+
+
+class SourceGenerator(NodeVisitor):
+ """
+ This visitor is able to transform a well formed syntax tree into python
+ sourcecode. For more details have a look at the docstring of the
+ `to_source` function.
+ """
+
+ def __init__(self, indent_with):
+ self.result = []
+ self.indent_with = indent_with
+ self.indentation = 0
+ self.new_lines = 0
+
+ def write(self, x):
+ if self.new_lines:
+ if self.result:
+ self.result.append('\n' * self.new_lines)
+ self.result.append(self.indent_with * self.indentation)
+ self.new_lines = 0
+ self.result.append(x)
+
+ def newline(self, n=1):
+ self.new_lines = max(self.new_lines, n)
+
+ def body(self, statements):
+ self.new_line = True
+ self.indentation += 1
+ for stmt in statements:
+ self.visit(stmt)
+ self.indentation -= 1
+
+ def body_or_else(self, node):
+ self.body(node.body)
+ if node.orelse:
+ self.newline()
+ self.write('else:')
+ self.body(node.orelse)
+
+ def signature(self, node):
+ want_comma = []
+ def write_comma():
+ if want_comma:
+ self.write(', ')
+ else:
+ want_comma.append(True)
+
+ padding = [None] * (len(node.args) - len(node.defaults))
+ for arg, default in zip(node.args, padding + node.defaults):
+ write_comma()
+ self.visit(arg)
+ if default is not None:
+ self.write('=')
+ self.visit(default)
+ if node.vararg is not None:
+ write_comma()
+ self.write('*' + arg_stringname(node.vararg))
+ if node.kwarg is not None:
+ write_comma()
+ self.write('**' + arg_stringname(node.kwarg))
+
+ def decorators(self, node):
+ for decorator in node.decorator_list:
+ self.newline()
+ self.write('@')
+ self.visit(decorator)
+
+ # Statements
+
+ def visit_Assign(self, node):
+     """Write an assignment statement.
+
+     More than one entry in `node.targets` means a chained assignment
+     (``a = b = value``), so every target is followed by `` = ``.
+     """
+     self.newline()
+     # Tuple unpacking arrives as a single `Tuple` target and is rendered
+     # by its own visitor; joining targets with ', ' would turn a chained
+     # assignment into an unpacking one.
+     for target in node.targets:
+         self.visit(target)
+         self.write(' = ')
+     self.visit(node.value)
+
+ def visit_AugAssign(self, node):
+ self.newline()
+ self.visit(node.target)
+ self.write(BINOP_SYMBOLS[type(node.op)] + '=')
+ self.visit(node.value)
+
+ def visit_ImportFrom(self, node):
+     """Write a ``from <module> import <names>`` statement."""
+     self.newline()
+     # `module` is None for purely relative imports (``from . import x``);
+     # only the leading dots are written in that case.
+     self.write('from %s%s import ' % ('.' * node.level, node.module or ''))
+     for idx, item in enumerate(node.names):
+         if idx:
+             self.write(', ')
+         # Each name is an `alias` node, not a string: dispatch to
+         # `visit_alias` so that ``name as asname`` is rendered.
+         self.visit(item)
+
+ def visit_Import(self, node):
+     """Write an ``import a, b as c`` statement on a single line."""
+     self.newline()
+     # The keyword is written once; the names are comma separated.
+     self.write('import ')
+     for idx, item in enumerate(node.names):
+         if idx:
+             self.write(', ')
+         self.visit(item)
+
+ def visit_Expr(self, node):
+ self.newline()
+ self.generic_visit(node)
+
+ def visit_FunctionDef(self, node):
+ self.newline(n=2)
+ self.decorators(node)
+ self.newline()
+ self.write('def %s(' % node.name)
+ self.signature(node.args)
+ self.write('):')
+ self.body(node.body)
+
+ def visit_ClassDef(self, node):
+ have_args = []
+ def paren_or_comma():
+ if have_args:
+ self.write(', ')
+ else:
+ have_args.append(True)
+ self.write('(')
+
+ self.newline(n=3)
+ self.decorators(node)
+ self.newline()
+ self.write('class %s' % node.name)
+ for base in node.bases:
+ paren_or_comma()
+ self.visit(base)
+ # XXX: the if here is used to keep this module compatible
+ # with python 2.6.
+ if hasattr(node, 'keywords'):
+ for keyword in node.keywords:
+ paren_or_comma()
+ self.write(keyword.arg + '=')
+ self.visit(keyword.value)
+ if node.starargs is not None:
+ paren_or_comma()
+ self.write('*')
+ self.visit(node.starargs)
+ if node.kwargs is not None:
+ paren_or_comma()
+ self.write('**')
+ self.visit(node.kwargs)
+ self.write(have_args and '):' or ':')
+ self.body(node.body)
+
+ def visit_If(self, node):
+     """Write an ``if`` statement with its ``elif`` / ``else`` branches.
+
+     An `orelse` that consists of exactly one nested `If` is flattened
+     into an ``elif``; any other non-empty `orelse` becomes ``else:``.
+     """
+     self.newline()
+     self.write('if ')
+     self.visit(node.test)
+     self.write(':')
+     self.body(node.body)
+     while True:
+         else_ = node.orelse
+         if len(else_) == 1 and isinstance(else_[0], If):
+             # Continue the chain with the nested `If` as the current node.
+             node = else_[0]
+             self.newline()
+             self.write('elif ')
+             self.visit(node.test)
+             self.write(':')
+             self.body(node.body)
+         else:
+             # Only emit ``else:`` when there is something to put in it;
+             # a bare ``else:`` with no body is not valid source.
+             if else_:
+                 self.newline()
+                 self.write('else:')
+                 self.body(else_)
+             break
+
+ def visit_For(self, node):
+ self.newline()
+ self.write('for ')
+ self.visit(node.target)
+ self.write(' in ')
+ self.visit(node.iter)
+ self.write(':')
+ self.body_or_else(node)
+
+ def visit_While(self, node):
+ self.newline()
+ self.write('while ')
+ self.visit(node.test)
+ self.write(':')
+ self.body_or_else(node)
+
+ def visit_With(self, node):
+ self.newline()
+ self.write('with ')
+ self.visit(node.context_expr)
+ if node.optional_vars is not None:
+ self.write(' as ')
+ self.visit(node.optional_vars)
+ self.write(':')
+ self.body(node.body)
+
+ def visit_Pass(self, node):
+ self.newline()
+ self.write('pass')
+
+ def visit_Print(self, node):
+ # XXX: python 2.6 only
+ self.newline()
+ self.write('print ')
+ want_comma = False
+ if node.dest is not None:
+ self.write(' >> ')
+ self.visit(node.dest)
+ want_comma = True
+ for value in node.values:
+ if want_comma:
+ self.write(', ')
+ self.visit(value)
+ want_comma = True
+ if not node.nl:
+ self.write(',')
+
+ def visit_Delete(self, node):
+     """Write a ``del a, b`` statement."""
+     self.newline()
+     self.write('del ')
+     # The expressions to delete live in `node.targets`; the node itself
+     # is not iterable.
+     for idx, target in enumerate(node.targets):
+         if idx:
+             self.write(', ')
+         self.visit(target)
+
+ def visit_TryExcept(self, node):
+     """Write a ``try`` statement with its handlers and optional ``else``."""
+     self.newline()
+     self.write('try:')
+     self.body(node.body)
+     for handler in node.handlers:
+         self.visit(handler)
+     # getattr keeps hand-built nodes that never set `orelse` working.
+     if getattr(node, 'orelse', None):
+         self.newline()
+         self.write('else:')
+         self.body(node.orelse)
+
+ def visit_TryFinally(self, node):
+ self.newline()
+ self.write('try:')
+ self.body(node.body)
+ self.newline()
+ self.write('finally:')
+ self.body(node.finalbody)
+
+ def visit_Global(self, node):
+ self.newline()
+ self.write('global ' + ', '.join(node.names))
+
+ def visit_Nonlocal(self, node):
+ self.newline()
+ self.write('nonlocal ' + ', '.join(node.names))
+
+ def visit_Return(self, node):
+ self.newline()
+ self.write('return ')
+ self.visit(node.value)
+
+ def visit_Break(self, node):
+ self.newline()
+ self.write('break')
+
+ def visit_Continue(self, node):
+ self.newline()
+ self.write('continue')
+
+ def visit_Raise(self, node):
+     """Write a ``raise`` statement.
+
+     Handles both node layouts: `exc` / `cause` (``raise exc from cause``)
+     and the legacy `type` / `inst` / `tback` form
+     (``raise type, inst, tback``).
+     """
+     # XXX: Python 2.6 / 3.0 compatibility
+     self.newline()
+     self.write('raise')
+     if hasattr(node, 'exc') and node.exc is not None:
+         self.write(' ')
+         self.visit(node.exc)
+         if node.cause is not None:
+             self.write(' from ')
+             self.visit(node.cause)
+     elif hasattr(node, 'type') and node.type is not None:
+         # Separate the keyword from the exception type.
+         self.write(' ')
+         self.visit(node.type)
+         if node.inst is not None:
+             self.write(', ')
+             self.visit(node.inst)
+         if node.tback is not None:
+             self.write(', ')
+             self.visit(node.tback)
+
+ # Expressions
+
+ def visit_Attribute(self, node):
+ self.visit(node.value)
+ self.write('.' + node.attr)
+
+ def visit_Call(self, node):
+ want_comma = []
+ def write_comma():
+ if want_comma:
+ self.write(', ')
+ else:
+ want_comma.append(True)
+
+ self.visit(node.func)
+ self.write('(')
+ for arg in node.args:
+ write_comma()
+ self.visit(arg)
+ for keyword in node.keywords:
+ write_comma()
+ self.write(keyword.arg + '=')
+ self.visit(keyword.value)
+ if node.starargs is not None:
+ write_comma()
+ self.write('*')
+ self.visit(node.starargs)
+ if node.kwargs is not None:
+ write_comma()
+ self.write('**')
+ self.visit(node.kwargs)
+ self.write(')')
+
+ def visit_Name(self, node):
+ self.write(node.id)
+
+ def visit_NameConstant(self, node):
+ self.write(str(node.value))
+
+ def visit_arg(self, node):
+ self.write(node.arg)
+
+ def visit_Str(self, node):
+ self.write(repr(node.s))
+
+ def visit_Bytes(self, node):
+ self.write(repr(node.s))
+
+ def visit_Num(self, node):
+ self.write(repr(node.n))
+
+ def visit_Tuple(self, node):
+ self.write('(')
+ idx = -1
+ for idx, item in enumerate(node.elts):
+ if idx:
+ self.write(', ')
+ self.visit(item)
+ self.write(idx and ')' or ',)')
+
+ def sequence_visit(left, right):
+ def visit(self, node):
+ self.write(left)
+ for idx, item in enumerate(node.elts):
+ if idx:
+ self.write(', ')
+ self.visit(item)
+ self.write(right)
+ return visit
+
+ visit_List = sequence_visit('[', ']')
+ visit_Set = sequence_visit('{', '}')
+ del sequence_visit
+
+ def visit_Dict(self, node):
+ self.write('{')
+ for idx, (key, value) in enumerate(zip(node.keys, node.values)):
+ if idx:
+ self.write(', ')
+ self.visit(key)
+ self.write(': ')
+ self.visit(value)
+ self.write('}')
+
+ def visit_BinOp(self, node):
+ self.write('(')
+ self.visit(node.left)
+ self.write(' %s ' % BINOP_SYMBOLS[type(node.op)])
+ self.visit(node.right)
+ self.write(')')
+
+ def visit_BoolOp(self, node):
+ self.write('(')
+ for idx, value in enumerate(node.values):
+ if idx:
+ self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)])
+ self.visit(value)
+ self.write(')')
+
+ def visit_Compare(self, node):
+ self.write('(')
+ self.visit(node.left)
+ for op, right in zip(node.ops, node.comparators):
+ self.write(' %s ' % CMPOP_SYMBOLS[type(op)])
+ self.visit(right)
+ self.write(')')
+
+ def visit_UnaryOp(self, node):
+ self.write('(')
+ op = UNARYOP_SYMBOLS[type(node.op)]
+ self.write(op)
+ if op == 'not':
+ self.write(' ')
+ self.visit(node.operand)
+ self.write(')')
+
+ def visit_Subscript(self, node):
+ self.visit(node.value)
+ self.write('[')
+ self.visit(node.slice)
+ self.write(']')
+
+ def visit_Slice(self, node):
+ if node.lower is not None:
+ self.visit(node.lower)
+ self.write(':')
+ if node.upper is not None:
+ self.visit(node.upper)
+ if node.step is not None:
+ self.write(':')
+ if not (isinstance(node.step, Name) and node.step.id == 'None'):
+ self.visit(node.step)
+
+ def visit_ExtSlice(self, node):
+     """Write the comma separated dimensions of an extended slice."""
+     # `dims` is a plain list of slice nodes; enumerate supplies the index
+     # that decides whether a separator is needed.
+     for idx, item in enumerate(node.dims):
+         if idx:
+             self.write(', ')
+         self.visit(item)
+
+ def visit_Yield(self, node):
+ self.write('yield ')
+ self.visit(node.value)
+
+ def visit_Lambda(self, node):
+ self.write('lambda ')
+ self.signature(node.args)
+ self.write(': ')
+ self.visit(node.body)
+
+ def visit_Ellipsis(self, node):
+ self.write('Ellipsis')
+
+ def generator_visit(left, right):
+ def visit(self, node):
+ self.write(left)
+ self.visit(node.elt)
+ for comprehension in node.generators:
+ self.visit(comprehension)
+ self.write(right)
+ return visit
+
+ visit_ListComp = generator_visit('[', ']')
+ visit_GeneratorExp = generator_visit('(', ')')
+ visit_SetComp = generator_visit('{', '}')
+ del generator_visit
+
+ def visit_DictComp(self, node):
+ self.write('{')
+ self.visit(node.key)
+ self.write(': ')
+ self.visit(node.value)
+ for comprehension in node.generators:
+ self.visit(comprehension)
+ self.write('}')
+
+ def visit_IfExp(self, node):
+ self.visit(node.body)
+ self.write(' if ')
+ self.visit(node.test)
+ self.write(' else ')
+ self.visit(node.orelse)
+
+ def visit_Starred(self, node):
+ self.write('*')
+ self.visit(node.value)
+
+ def visit_Repr(self, node):
+ # XXX: python 2.6 only
+ self.write('`')
+ self.visit(node.value)
+ self.write('`')
+
+ # Helper Nodes
+
+ def visit_alias(self, node):
+ self.write(node.name)
+ if node.asname is not None:
+ self.write(' as ' + node.asname)
+
+ def visit_comprehension(self, node):
+ self.write(' for ')
+ self.visit(node.target)
+ self.write(' in ')
+ self.visit(node.iter)
+ if node.ifs:
+ for if_ in node.ifs:
+ self.write(' if ')
+ self.visit(if_)
+
+ def visit_excepthandler(self, node):
+     """Write one ``except [type [as name]]:`` clause and its body."""
+     self.newline()
+     self.write('except')
+     if node.type is not None:
+         self.write(' ')
+         self.visit(node.type)
+         if node.name is not None:
+             self.write(' as ')
+             # `name` may be an AST node or a plain identifier string.
+             if isinstance(node.name, AST):
+                 self.visit(node.name)
+             else:
+                 self.write(node.name)
+     self.write(':')
+     self.body(node.body)
+
+ # Visitors are looked up as 'visit_' + class name, and the handler node
+ # class is spelled `ExceptHandler`; register the method under that name too.
+ visit_ExceptHandler = visit_excepthandler
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py
new file mode 100644
index 00000000000..65fd84dfe15
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py
@@ -0,0 +1,178 @@
+# mako/ast.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""utilities for analyzing expressions and blocks of Python
+code, as well as generating Python from AST nodes"""
+
+from mako import exceptions, pyparser, compat
+import re
+
+class PythonCode(object):
+ """represents information about a string containing Python code"""
+ def __init__(self, code, **exception_kwargs):
+ self.code = code
+
+ # represents all identifiers which are assigned to at some point in
+ # the code
+ self.declared_identifiers = set()
+
+ # represents all identifiers which are referenced before their
+ # assignment, if any
+ self.undeclared_identifiers = set()
+
+ # note that an identifier can be in both the undeclared and declared
+ # lists.
+
+ # using AST to parse instead of using code.co_varnames,
+ # code.co_names has several advantages:
+ # - we can locate an identifier as "undeclared" even if
+ # its declared later in the same block of code
+ # - AST is less likely to break with version changes
+ # (for example, the behavior of co_names changed a little bit
+ # in python version 2.5)
+ if isinstance(code, compat.string_types):
+ expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs)
+ else:
+ expr = code
+
+ f = pyparser.FindIdentifiers(self, **exception_kwargs)
+ f.visit(expr)
+
+class ArgumentList(object):
+    """parses a fragment of code as a comma-separated list of expressions"""
+    def __init__(self, code, **exception_kwargs):
+        """Parse `code` and hand the result to `pyparser.FindTuple`.
+
+        :param code: either a source string or an already parsed
+         expression; strings are parsed with `pyparser.parse`.
+        :param \\**exception_kwargs: forwarded unchanged to `pyparser.parse`
+         and `pyparser.FindTuple`.
+        """
+        self.codeargs = []
+        self.args = []
+        self.declared_identifiers = set()
+        self.undeclared_identifiers = set()
+        if isinstance(code, compat.string_types):
+            # `search`, not `match`: the trailing comma has to be looked
+            # for at the END of the text.  An anchored match only ever
+            # fires when the text starts with the comma, so "a, b," would
+            # get a second comma appended and fail to parse.
+            if re.match(r"\S", code) and not re.search(r",\s*$", code):
+                # if there's text and no trailing comma, ensure it's parsed
+                # as a tuple by adding a trailing comma
+                code += ","
+            expr = pyparser.parse(code, "exec", **exception_kwargs)
+        else:
+            expr = code
+
+        f = pyparser.FindTuple(self, PythonCode, **exception_kwargs)
+        f.visit(expr)
+
+class PythonFragment(PythonCode):
+ """extends PythonCode to provide identifier lookups in partial control
+ statements
+
+ e.g.
+ for x in 5:
+ elif y==9:
+ except (MyException, e):
+ etc.
+ """
+ def __init__(self, code, **exception_kwargs):
+ m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S)
+ if not m:
+ raise exceptions.CompileException(
+ "Fragment '%s' is not a partial control statement" %
+ code, **exception_kwargs)
+ if m.group(3):
+ code = code[:m.start(3)]
+ (keyword, expr) = m.group(1,2)
+ if keyword in ['for','if', 'while']:
+ code = code + "pass"
+ elif keyword == 'try':
+ code = code + "pass\nexcept:pass"
+ elif keyword == 'elif' or keyword == 'else':
+ code = "if False:pass\n" + code + "pass"
+ elif keyword == 'except':
+ code = "try:pass\n" + code + "pass"
+ elif keyword == 'with':
+ code = code + "pass"
+ else:
+ raise exceptions.CompileException(
+ "Unsupported control keyword: '%s'" %
+ keyword, **exception_kwargs)
+ super(PythonFragment, self).__init__(code, **exception_kwargs)
+
+
+class FunctionDecl(object):
+ """function declaration"""
+ def __init__(self, code, allow_kwargs=True, **exception_kwargs):
+ self.code = code
+ expr = pyparser.parse(code, "exec", **exception_kwargs)
+
+ f = pyparser.ParseFunc(self, **exception_kwargs)
+ f.visit(expr)
+ if not hasattr(self, 'funcname'):
+ raise exceptions.CompileException(
+ "Code '%s' is not a function declaration" % code,
+ **exception_kwargs)
+ if not allow_kwargs and self.kwargs:
+ raise exceptions.CompileException(
+ "'**%s' keyword argument not allowed here" %
+ self.kwargnames[-1], **exception_kwargs)
+
+ def get_argument_expressions(self, as_call=False):
+ """Return the argument declarations of this FunctionDecl as a printable
+ list.
+
+ By default the return value is appropriate for writing in a ``def``;
+ set `as_call` to true to build arguments to be passed to the function
+ instead (assuming locals with the same names as the arguments exist).
+ """
+
+ namedecls = []
+
+ # Build in reverse order, since defaults and slurpy args come last
+ argnames = self.argnames[::-1]
+ kwargnames = self.kwargnames[::-1]
+ defaults = self.defaults[::-1]
+ kwdefaults = self.kwdefaults[::-1]
+
+ # Named arguments
+ if self.kwargs:
+ namedecls.append("**" + kwargnames.pop(0))
+
+ for name in kwargnames:
+ # Keyword-only arguments must always be used by name, so even if
+ # this is a call, print out `foo=foo`
+ if as_call:
+ namedecls.append("%s=%s" % (name, name))
+ elif kwdefaults:
+ default = kwdefaults.pop(0)
+ if default is None:
+ # The AST always gives kwargs a default, since you can do
+ # `def foo(*, a=1, b, c=3)`
+ namedecls.append(name)
+ else:
+ namedecls.append("%s=%s" % (
+ name, pyparser.ExpressionGenerator(default).value()))
+ else:
+ namedecls.append(name)
+
+ # Positional arguments
+ if self.varargs:
+ namedecls.append("*" + argnames.pop(0))
+
+ for name in argnames:
+ if as_call or not defaults:
+ namedecls.append(name)
+ else:
+ default = defaults.pop(0)
+ namedecls.append("%s=%s" % (
+ name, pyparser.ExpressionGenerator(default).value()))
+
+ namedecls.reverse()
+ return namedecls
+
+ @property
+ def allargnames(self):
+ return tuple(self.argnames) + tuple(self.kwargnames)
+
+class FunctionArgs(FunctionDecl):
+ """the argument portion of a function declaration"""
+
+ def __init__(self, code, **kwargs):
+ super(FunctionArgs, self).__init__("def ANON(%s):pass" % code,
+ **kwargs)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py
new file mode 100644
index 00000000000..c405c5171d7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py
@@ -0,0 +1,238 @@
+# mako/cache.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+from mako import compat, util
+
+_cache_plugins = util.PluginLoader("mako.cache")
+
+register_plugin = _cache_plugins.register
+register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl")
+
+
+class Cache(object):
+ """Represents a data content cache made available to the module
+ space of a specific :class:`.Template` object.
+
+ .. versionadded:: 0.6
+ :class:`.Cache` by itself is mostly a
+ container for a :class:`.CacheImpl` object, which implements
+ a fixed API to provide caching services; specific subclasses exist to
+ implement different
+ caching strategies. Mako includes a backend that works with
+ the Beaker caching system. Beaker itself then supports
+ a number of backends (i.e. file, memory, memcached, etc.)
+
+ The construction of a :class:`.Cache` is part of the mechanics
+ of a :class:`.Template`, and programmatic access to this
+ cache is typically via the :attr:`.Template.cache` attribute.
+
+ """
+
+ impl = None
+ """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`.
+
+ This accessor allows a :class:`.CacheImpl` with additional
+ methods beyond that of :class:`.Cache` to be used programmatically.
+
+ """
+
+ id = None
+ """Return the 'id' that identifies this cache.
+
+ This is a value that should be globally unique to the
+ :class:`.Template` associated with this cache, and can
+ be used by a caching system to name a local container
+ for data specific to this template.
+
+ """
+
+ starttime = None
+ """Epochal time value for when the owning :class:`.Template` was
+ first compiled.
+
+ A cache implementation may wish to invalidate data earlier than
+ this timestamp; this has the effect of the cache for a specific
+ :class:`.Template` starting clean any time the :class:`.Template`
+ is recompiled, such as when the original template file changed on
+ the filesystem.
+
+ """
+
+ def __init__(self, template, *args):
+ # check for a stale template calling the
+ # constructor
+ if isinstance(template, compat.string_types) and args:
+ return
+ self.template = template
+ self.id = template.module.__name__
+ self.starttime = template.module._modified_time
+ self._def_regions = {}
+ self.impl = self._load_impl(self.template.cache_impl)
+
+ def _load_impl(self, name):
+ return _cache_plugins.load(name)(self)
+
+ def get_or_create(self, key, creation_function, **kw):
+ """Retrieve a value from the cache, using the given creation function
+ to generate a new value."""
+
+ return self._ctx_get_or_create(key, creation_function, None, **kw)
+
+ def _ctx_get_or_create(self, key, creation_function, context, **kw):
+ """Retrieve a value from the cache, using the given creation function
+ to generate a new value."""
+
+ if not self.template.cache_enabled:
+ return creation_function()
+
+ return self.impl.get_or_create(
+ key,
+ creation_function,
+ **self._get_cache_kw(kw, context))
+
+ def set(self, key, value, **kw):
+     r"""Place a value in the cache.
+
+     :param key: the value's key.
+     :param value: the value.
+     :param \**kw: cache configuration arguments.
+
+     """
+
+     # The docstring is raw so that the Sphinx escape ``\*`` is not read
+     # as an (invalid) string escape sequence.
+     self.impl.set(key, value, **self._get_cache_kw(kw, None))
+
+ put = set
+ """A synonym for :meth:`.Cache.set`.
+
+ This is here for backwards compatibility.
+
+ """
+
+ def get(self, key, **kw):
+     r"""Retrieve a value from the cache.
+
+     :param key: the value's key.
+     :param \**kw: cache configuration arguments.  The
+      backend is configured using these arguments upon first request.
+      Subsequent requests that use the same series of configuration
+      values will use that same backend.
+
+     """
+     # The docstring is raw so that the Sphinx escape ``\*`` is not read
+     # as an (invalid) string escape sequence.
+     return self.impl.get(key, **self._get_cache_kw(kw, None))
+
+ def invalidate(self, key, **kw):
+     r"""Invalidate a value in the cache.
+
+     :param key: the value's key.
+     :param \**kw: cache configuration arguments.  The
+      backend is configured using these arguments upon first request.
+      Subsequent requests that use the same series of configuration
+      values will use that same backend.
+
+     """
+     # The docstring is raw so that the Sphinx escape ``\*`` is not read
+     # as an (invalid) string escape sequence.
+     self.impl.invalidate(key, **self._get_cache_kw(kw, None))
+
+ def invalidate_body(self):
+ """Invalidate the cached content of the "body" method for this
+ template.
+
+ """
+ self.invalidate('render_body', __M_defname='render_body')
+
+ def invalidate_def(self, name):
+ """Invalidate the cached content of a particular ``<%def>`` within this
+ template.
+
+ """
+
+ self.invalidate('render_%s' % name, __M_defname='render_%s' % name)
+
+ def invalidate_closure(self, name):
+ """Invalidate a nested ``<%def>`` within this template.
+
+ Caching of nested defs is a blunt tool as there is no
+ management of scope -- nested defs that use cache tags
+ need to have names unique of all other nested defs in the
+ template, else their content will be overwritten by
+ each other.
+
+ """
+
+ self.invalidate(name, __M_defname=name)
+
+ def _get_cache_kw(self, kw, context):
+ defname = kw.pop('__M_defname', None)
+ if not defname:
+ tmpl_kw = self.template.cache_args.copy()
+ tmpl_kw.update(kw)
+ elif defname in self._def_regions:
+ tmpl_kw = self._def_regions[defname]
+ else:
+ tmpl_kw = self.template.cache_args.copy()
+ tmpl_kw.update(kw)
+ self._def_regions[defname] = tmpl_kw
+ if context and self.impl.pass_context:
+ tmpl_kw = tmpl_kw.copy()
+ tmpl_kw.setdefault('context', context)
+ return tmpl_kw
+
+
+class CacheImpl(object):
+    """Provide a cache implementation for use by :class:`.Cache`.
+
+    Every operation here raises :class:`NotImplementedError`; the docstrings
+    describe what an implementation is expected to do.
+    """
+
+    # NOTE: method docstrings are raw strings so that the Sphinx escape
+    # ``\*`` is not read as an (invalid) string escape sequence.
+
+    def __init__(self, cache):
+        # the owning :class:`.Cache`
+        self.cache = cache
+
+    pass_context = False
+    """If ``True``, the :class:`.Context` will be passed to
+    :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``.
+    """
+
+    def get_or_create(self, key, creation_function, **kw):
+        r"""Retrieve a value from the cache, using the given creation function
+        to generate a new value.
+
+        This function *must* return a value, either from
+        the cache, or via the given creation function.
+        If the creation function is called, the newly
+        created value should be populated into the cache
+        under the given key before being returned.
+
+        :param key: the value's key.
+        :param creation_function: function that when called generates
+         a new value.
+        :param \**kw: cache configuration arguments.
+
+        """
+        raise NotImplementedError()
+
+    def set(self, key, value, **kw):
+        r"""Place a value in the cache.
+
+        :param key: the value's key.
+        :param value: the value.
+        :param \**kw: cache configuration arguments.
+
+        """
+        raise NotImplementedError()
+
+    def get(self, key, **kw):
+        r"""Retrieve a value from the cache.
+
+        :param key: the value's key.
+        :param \**kw: cache configuration arguments.
+
+        """
+        raise NotImplementedError()
+
+    def invalidate(self, key, **kw):
+        r"""Invalidate a value in the cache.
+
+        :param key: the value's key.
+        :param \**kw: cache configuration arguments.
+
+        """
+        raise NotImplementedError()
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py
new file mode 100644
index 00000000000..1a9ca56637c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py
@@ -0,0 +1,62 @@
+# mako/cmd.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+from argparse import ArgumentParser
+from os.path import isfile, dirname
+import sys
+from mako.template import Template
+from mako.lookup import TemplateLookup
+from mako import exceptions
+
def varsplit(var):
    """Split a ``name=value`` string into a ``(name, value)`` pair.

    Only the first ``=`` separates the two parts, so the value may itself
    contain ``=``.  When there is no ``=`` at all the value is the empty
    string.

    :param var: the raw string, e.g. one ``--var`` command-line argument.
    :return: a 2-tuple ``(name, value)``; always a tuple, whichever form
     the input takes.

    """
    name, _sep, value = var.partition("=")
    return (name, value)
+
def _exit():
    """Report a failure on stderr and terminate with exit status 1.

    The text written is whatever ``exceptions.text_error_template()``
    renders; this helper is called from the ``except`` handlers in
    :func:`cmdline`.
    """
    report = exceptions.text_error_template().render()
    sys.stderr.write(report)
    sys.exit(1)
+
def cmdline(argv=None):
    """Render a Mako template from the command line and print the result.

    The template is read from the file named by the positional argument, or
    from stdin when that argument is omitted or is ``-``.  ``--var name=value``
    options become keyword arguments to ``Template.render``.

    :param argv: argument list to parse; ``None`` lets argparse fall back
     to ``sys.argv[1:]``.
    :raises SystemExit: if the named file does not exist, or (via
     :func:`_exit`) if building or rendering the template fails.

    """
    # argparse's first positional parameter is ``prog``, and ``%prog`` is
    # optparse syntax; pass a real argparse usage string instead so that
    # ``--help`` no longer prints "usage: usage: %prog [FILENAME] ...".
    parser = ArgumentParser(usage="%(prog)s [options] [FILENAME]")
    parser.add_argument("--var", default=[], action="append",
                        help="variable (can be used multiple times, "
                        "use name=value)")
    parser.add_argument("--template-dir", default=[], action="append",
                        help="Directory to use for template lookup (multiple "
                        "directories may be provided). If not given then if the "
                        "template is read from stdin, the value defaults to be "
                        "the current directory, otherwise it defaults to be the "
                        "parent directory of the file provided.")
    parser.add_argument('input', nargs='?', default='-')

    options = parser.parse_args(argv)
    if options.input == '-':
        lookup_dirs = options.template_dir or ["."]
        lookup = TemplateLookup(lookup_dirs)
        try:
            template = Template(sys.stdin.read(), lookup=lookup)
        except Exception:
            # only real errors are reported through the Mako error template;
            # KeyboardInterrupt / SystemExit propagate untouched
            _exit()
    else:
        filename = options.input
        if not isfile(filename):
            raise SystemExit("error: can't find %s" % filename)
        lookup_dirs = options.template_dir or [dirname(filename)]
        lookup = TemplateLookup(lookup_dirs)
        try:
            template = Template(filename=filename, lookup=lookup)
        except Exception:
            _exit()

    kw = dict([varsplit(var) for var in options.var])
    try:
        print(template.render(**kw))
    except Exception:
        _exit()
+
+
# run the command-line entry point when this module is executed as a script
if __name__ == "__main__":
    cmdline()
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py
new file mode 100644
index 00000000000..4b0bda86731
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py
@@ -0,0 +1,1237 @@
+# mako/codegen.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""provides functionality for rendering a parsetree constructing into module
+source code."""
+
+import time
+import re
+from mako.pygen import PythonPrinter
+from mako import util, ast, parsetree, filters, exceptions
+from mako import compat
+
+
# written into every generated module as ``_magic_number`` (see
# write_toplevel) -- presumably a version stamp that lets stale compiled
# modules be detected; TODO confirm against the module loader
MAGIC_NUMBER = 10

# names which are hardwired into the
# template and are not accessed via the
# context itself
RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED'])
+
def compile(node,
            uri,
            filename=None,
            default_filters=None,
            buffer_filters=None,
            imports=None,
            future_imports=None,
            source_encoding=None,
            generate_magic_comment=True,
            disable_unicode=False,
            strict_undefined=False,
            enable_loop=True,
            reserved_names=frozenset()):
    """Turn a parsetree node into the source text of a Python module.

    All keyword options are bundled into a :class:`_CompileContext`; a
    :class:`_GenerateRenderMethod` visitor then writes the module through a
    ``PythonPrinter`` into an in-memory buffer, whose contents are returned.

    :param node: root parsetree node of the template.
    :param uri: the template's URI.
    :param filename: optional source filename.
    :return: the value of the output buffer after generation.
    """

    # On Py2K keep "source_encoding" a bytestring: it gets embedded into
    # the generated source, and a unicode object there would coerce the
    # whole result to unicode even in "disable_unicode" mode.
    on_py2 = not compat.py3k
    if on_py2 and isinstance(source_encoding, compat.text_type):
        source_encoding = source_encoding.encode(source_encoding)

    output = util.FastEncodingBuffer()
    emitter = PythonPrinter(output)

    settings = _CompileContext(
        uri,
        filename,
        default_filters,
        buffer_filters,
        imports,
        future_imports,
        source_encoding,
        generate_magic_comment,
        disable_unicode,
        strict_undefined,
        enable_loop,
        reserved_names,
    )

    # constructing the visitor performs the whole code generation
    _GenerateRenderMethod(emitter, settings, node)
    return output.getvalue()
+
+class _CompileContext(object):
+ def __init__(self,
+ uri,
+ filename,
+ default_filters,
+ buffer_filters,
+ imports,
+ future_imports,
+ source_encoding,
+ generate_magic_comment,
+ disable_unicode,
+ strict_undefined,
+ enable_loop,
+ reserved_names):
+ self.uri = uri
+ self.filename = filename
+ self.default_filters = default_filters
+ self.buffer_filters = buffer_filters
+ self.imports = imports
+ self.future_imports = future_imports
+ self.source_encoding = source_encoding
+ self.generate_magic_comment = generate_magic_comment
+ self.disable_unicode = disable_unicode
+ self.strict_undefined = strict_undefined
+ self.enable_loop = enable_loop
+ self.reserved_names = reserved_names
+
+class _GenerateRenderMethod(object):
+ """A template visitor object which generates the
+ full module source for a template.
+
+ """
    def __init__(self, printer, compiler, node):
        """Generate code for ``node`` immediately, as a side effect of
        construction.

        If ``node`` is a ``DefTag``/``BlockTag`` a single ``render_<name>``
        callable is written.  Otherwise ``node`` is treated as the template
        root: the module preamble is written, then ``render_body``, then one
        nested generator is run per top-level def, and finally the metadata
        trailer.

        :param printer: output printer receiving the generated source.
        :param compiler: the shared compile context.
        :param node: parsetree node to render.
        """
        self.printer = printer
        self.compiler = compiler
        self.node = node
        # stack of identifier scopes; the ``identifiers`` property reads the top
        self.identifier_stack = [None]
        self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag))

        if self.in_def:
            name = "render_%s" % node.funcname
            args = node.get_argument_expressions()
            filtered = len(node.filter_args.args) > 0
            # NOTE(review): tag attribute text is passed to eval(); this is
            # only safe while template source is trusted.
            buffered = eval(node.attributes.get('buffered', 'False'))
            cached = eval(node.attributes.get('cached', 'False'))
            defs = None
            pagetag = None
            if node.is_block and not node.is_anonymous:
                args += ['**pageargs']
        else:
            # template root: write_toplevel() emits the module header and
            # returns the top-level defs that still need their own callables
            defs = self.write_toplevel()
            pagetag = self.compiler.pagetag
            name = "render_body"
            if pagetag is not None:
                args = pagetag.body_decl.get_argument_expressions()
                if not pagetag.body_decl.kwargs:
                    args += ['**pageargs']
                cached = eval(pagetag.attributes.get('cached', 'False'))
                # the page tag can switch loop support on, never off
                self.compiler.enable_loop = self.compiler.enable_loop or eval(
                    pagetag.attributes.get(
                        'enable_loop', 'False')
                )
            else:
                args = ['**pageargs']
                cached = False
            buffered = filtered = False
        # every render callable takes ``context`` as its first argument
        if args is None:
            args = ['context']
        else:
            args = [a for a in ['context'] + args]

        self.write_render_callable(
            pagetag or node,
            name, args,
            buffered, filtered, cached)

        if defs is not None:
            # each top-level def gets its own generator instance
            for node in defs:
                _GenerateRenderMethod(printer, compiler, node)

        if not self.in_def:
            self.write_metadata_struct()
+
+ def write_metadata_struct(self):
+ self.printer.source_map[self.printer.lineno] = \
+ max(self.printer.source_map)
+ struct = {
+ "filename": self.compiler.filename,
+ "uri": self.compiler.uri,
+ "source_encoding": self.compiler.source_encoding,
+ "line_map": self.printer.source_map,
+ }
+ self.printer.writelines(
+ '"""',
+ '__M_BEGIN_METADATA',
+ compat.json.dumps(struct),
+ '__M_END_METADATA\n'
+ '"""'
+ )
+
+ @property
+ def identifiers(self):
+ return self.identifier_stack[-1]
+
    def write_toplevel(self):
        """Traverse a template structure for module-level directives and
        generate the start of module-level code.

        Side effects on ``self.compiler``: sets ``pagetag``, ``namespaces``
        and ``identifiers``.

        :return: list of the template's top-level def nodes.

        """
        inherit = []
        namespaces = {}
        module_code = []

        self.compiler.pagetag = None

        # visitor that sorts the root's direct children into the
        # collections above (``s`` is the visitor, ``self`` the generator)
        class FindTopLevel(object):
            def visitInheritTag(s, node):
                inherit.append(node)
            def visitNamespaceTag(s, node):
                namespaces[node.name] = node
            def visitPageTag(s, node):
                self.compiler.pagetag = node
            def visitCode(s, node):
                if node.ismodule:
                    module_code.append(node)

        f = FindTopLevel()
        for n in self.node.nodes:
            n.accept_visitor(f)

        self.compiler.namespaces = namespaces

        # names declared by module-level code blocks
        module_ident = set()
        for n in module_code:
            module_ident = module_ident.union(n.declared_identifiers())

        module_identifiers = _Identifiers(self.compiler)
        module_identifiers.declared = module_ident

        # module-level names, python code
        if self.compiler.generate_magic_comment and \
                self.compiler.source_encoding:
            self.printer.writeline("# -*- coding:%s -*-" %
                                   self.compiler.source_encoding)

        if self.compiler.future_imports:
            self.printer.writeline("from __future__ import %s" %
                                   (", ".join(self.compiler.future_imports),))
        self.printer.writeline("from mako import runtime, filters, cache")
        self.printer.writeline("UNDEFINED = runtime.UNDEFINED")
        self.printer.writeline("__M_dict_builtin = dict")
        self.printer.writeline("__M_locals_builtin = locals")
        self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER)
        self.printer.writeline("_modified_time = %r" % time.time())
        self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop)
        self.printer.writeline(
            "_template_filename = %r" % self.compiler.filename)
        self.printer.writeline("_template_uri = %r" % self.compiler.uri)
        self.printer.writeline(
            "_source_encoding = %r" % self.compiler.source_encoding)
        if self.compiler.imports:
            # write each configured import line, and also collect them so
            # the names they declare can be registered below
            buf = ''
            for imp in self.compiler.imports:
                buf += imp + "\n"
                self.printer.writeline(imp)
            impcode = ast.PythonCode(
                buf,
                source='', lineno=0,
                pos=0,
                filename='template defined imports')
        else:
            impcode = None

        main_identifiers = module_identifiers.branch(self.node)
        module_identifiers.topleveldefs = \
            module_identifiers.topleveldefs.\
            union(main_identifiers.topleveldefs)
        module_identifiers.declared.add("UNDEFINED")
        if impcode:
            module_identifiers.declared.update(impcode.declared_identifiers)

        self.compiler.identifiers = module_identifiers
        self.printer.writeline("_exports = %r" %
                               [n.name for n in
                                main_identifiers.topleveldefs.values()]
                               )
        self.printer.write_blanks(2)

        if len(module_code):
            self.write_module_code(module_code)

        # with inheritance, the namespace helpers are always written (the
        # generated _mako_inherit calls _mako_generate_namespaces); only the
        # last inherit tag is used
        if len(inherit):
            self.write_namespaces(namespaces)
            self.write_inherit(inherit[-1])
        elif len(namespaces):
            self.write_namespaces(namespaces)

        return list(main_identifiers.topleveldefs.values())
+
    def write_render_callable(self, node, name, args, buffered, filtered,
                              cached):
        """write a top-level render callable.

        this could be the main render() method or that of a top-level def.

        :param node: the page tag or def/block node supplying decorator,
         line number and cache attributes.
        :param name: name of the generated function.
        :param args: argument expressions for its signature.
        :param buffered: / :param filtered: / :param cached: flags that
         select whether a buffer is pushed and how the function ends.
        """

        if self.in_def:
            decorator = node.decorator
            if decorator:
                self.printer.writeline(
                    "@runtime._decorate_toplevel(%s)" % decorator)

        self.printer.start_source(node.lineno)
        self.printer.writelines(
            "def %s(%s):" % (name, ','.join(args)),
            # push new frame, assign current frame to __M_caller
            "__M_caller = context.caller_stack._push_frame()",
            "try:"
        )
        if buffered or filtered or cached:
            self.printer.writeline("context._push_buffer()")

        # open the identifier scope for the body of this callable
        self.identifier_stack.append(
            self.compiler.identifiers.branch(self.node))
        if (not self.in_def or self.node.is_block) and '**pageargs' in args:
            self.identifier_stack[-1].argument_declared.add('pageargs')

        # the template body keeps a __M_locals dict, seeded with its
        # arguments, which write_def_decl() passes on to top-level defs
        if not self.in_def and (
            len(self.identifiers.locally_assigned) > 0 or
            len(self.identifiers.argument_declared) > 0
        ):
            self.printer.writeline("__M_locals = __M_dict_builtin(%s)" %
                                   ','.join([
                                       "%s=%s" % (x, x) for x in
                                       self.identifiers.argument_declared
                                   ]))

        self.write_variable_declares(self.identifiers, toplevel=True)

        for n in self.node.nodes:
            n.accept_visitor(self)

        self.write_def_finish(self.node, buffered, filtered, cached)
        self.printer.writeline(None)
        self.printer.write_blanks(2)
        if cached:
            self.write_cache_decorator(
                node, name,
                args, buffered,
                self.identifiers, toplevel=True)
+
+ def write_module_code(self, module_code):
+ """write module-level template code, i.e. that which
+ is enclosed in <%! %> tags in the template."""
+ for n in module_code:
+ self.printer.start_source(n.lineno)
+ self.printer.write_indented_block(n.text)
+
+ def write_inherit(self, node):
+ """write the module-level inheritance-determination callable."""
+
+ self.printer.writelines(
+ "def _mako_inherit(template, context):",
+ "_mako_generate_namespaces(context)",
+ "return runtime._inherit_from(context, %s, _template_uri)" %
+ (node.parsed_attributes['file']),
+ None
+ )
+
    def write_namespaces(self, namespaces):
        """write the module-level namespace-generating callable.

        Two functions are emitted: ``_mako_get_namespace``, which looks a
        namespace up in ``context.namespaces`` and generates the namespaces
        on a ``KeyError``, and ``_mako_generate_namespaces``, which builds
        one namespace object per tag in ``namespaces``.

        :param namespaces: dict of namespace name -> namespace tag node.
        """
        self.printer.writelines(
            "def _mako_get_namespace(context, name):",
            "try:",
            "return context.namespaces[(__name__, name)]",
            "except KeyError:",
            "_mako_generate_namespaces(context)",
            "return context.namespaces[(__name__, name)]",
            None, None
        )
        self.printer.writeline("def _mako_generate_namespaces(context):")


        for node in namespaces.values():
            if 'import' in node.attributes:
                # remembered so write_variable_declares() emits _import_ns
                self.compiler.has_ns_imports = True
            self.printer.start_source(node.lineno)
            if len(node.nodes):
                # defs written inside the namespace tag are rendered as
                # closures of a local make_namespace() function
                self.printer.writeline("def make_namespace():")
                export = []
                identifiers = self.compiler.identifiers.branch(node)
                self.in_def = True
                class NSDefVisitor(object):
                    def visitDefTag(s, node):
                        s.visitDefOrBase(node)

                    def visitBlockTag(s, node):
                        s.visitDefOrBase(node)

                    def visitDefOrBase(s, node):
                        if node.is_anonymous:
                            raise exceptions.CompileException(
                                "Can't put anonymous blocks inside "
                                "<%namespace>",
                                **node.exception_kwargs
                            )
                        self.write_inline_def(node, identifiers, nested=False)
                        export.append(node.funcname)
                vis = NSDefVisitor()
                for n in node.nodes:
                    n.accept_visitor(vis)
                self.printer.writeline("return [%s]" % (','.join(export)))
                self.printer.writeline(None)
                self.in_def = False
                callable_name = "make_namespace()"
            else:
                callable_name = "None"

            # choose the runtime namespace class from the tag's attributes
            if 'file' in node.parsed_attributes:
                self.printer.writeline(
                    "ns = runtime.TemplateNamespace(%r,"
                    " context._clean_inheritance_tokens(),"
                    " templateuri=%s, callables=%s, "
                    " calling_uri=_template_uri)" %
                    (
                        node.name,
                        node.parsed_attributes.get('file', 'None'),
                        callable_name,
                    )
                )
            elif 'module' in node.parsed_attributes:
                self.printer.writeline(
                    "ns = runtime.ModuleNamespace(%r,"
                    " context._clean_inheritance_tokens(),"
                    " callables=%s, calling_uri=_template_uri,"
                    " module=%s)" %
                    (
                        node.name,
                        callable_name,
                        node.parsed_attributes.get(
                            'module', 'None')
                    )
                )
            else:
                self.printer.writeline(
                    "ns = runtime.Namespace(%r,"
                    " context._clean_inheritance_tokens(),"
                    " callables=%s, calling_uri=_template_uri)" %
                    (
                        node.name,
                        callable_name,
                    )
                )
            # NOTE(review): attribute text is passed to eval(); only safe
            # while template source is trusted.
            if eval(node.attributes.get('inheritable', "False")):
                self.printer.writeline("context['self'].%s = ns" % (node.name))

            self.printer.writeline(
                "context.namespaces[(__name__, %s)] = ns" % repr(node.name))
            self.printer.write_blanks(1)
        # keep the generated function body non-empty
        if not len(namespaces):
            self.printer.writeline("pass")
        self.printer.writeline(None)
+
    def write_variable_declares(self, identifiers, toplevel=False, limit=None):
        """write variable declarations at the top of a function.

        the variable declarations are in the form of callable
        definitions for defs and/or name lookup within the
        function's context argument. the names declared are based
        on the names that are referenced in the function body,
        which don't otherwise have any explicit assignment
        operation. names that are assigned within the body are
        assumed to be locally-scoped variables and are not
        separately declared.

        for def callable definitions, if the def is a top-level
        callable then a 'stub' callable is generated which wraps
        the current Context into a closure. if the def is not
        top-level, it is fully rendered as a local closure.

        :param identifiers: the identifier scope of the function being
         written.
        :param toplevel: when true (and a namespace uses ``import``), also
         emit the ``_import_ns`` population code.
        :param limit: optional collection of names; only names also present
         in it are declared.

        """

        # collection of all defs available to us in this scope
        comp_idents = dict([(c.funcname, c) for c in identifiers.defs])
        to_write = set()

        # write "context.get()" for all variables we are going to
        # need that aren't in the namespace yet
        to_write = to_write.union(identifiers.undeclared)

        # write closure functions for closures that we define
        # right here
        to_write = to_write.union(
            [c.funcname for c in identifiers.closuredefs.values()])

        # remove identifiers that are declared in the argument
        # signature of the callable
        to_write = to_write.difference(identifiers.argument_declared)

        # remove identifiers that we are going to assign to.
        # in this way we mimic Python's behavior,
        # i.e. assignment to a variable within a block
        # means that variable is now a "locally declared" var,
        # which cannot be referenced beforehand.
        to_write = to_write.difference(identifiers.locally_declared)

        # with loop support enabled, "loop" is never fetched from the
        # context; it is initialised from runtime.LoopStack() below
        if self.compiler.enable_loop:
            has_loop = "loop" in to_write
            to_write.discard("loop")
        else:
            has_loop = False

        # if a limiting set was sent, constraint to those items in that list
        # (this is used for the caching decorator)
        if limit is not None:
            to_write = to_write.intersection(limit)

        if toplevel and getattr(self.compiler, 'has_ns_imports', False):
            self.printer.writeline("_import_ns = {}")
            self.compiler.has_imports = True
            for ident, ns in self.compiler.namespaces.items():
                if 'import' in ns.attributes:
                    self.printer.writeline(
                        "_mako_get_namespace(context, %r)."
                        "_populate(_import_ns, %r)" %
                        (
                            ident,
                            re.split(r'\s*,\s*', ns.attributes['import'])
                        ))

        if has_loop:
            self.printer.writeline(
                'loop = __M_loop = runtime.LoopStack()'
            )

        for ident in to_write:
            if ident in comp_idents:
                # the name is a def/block: named blocks and root defs get a
                # stub calling the top-level render function, everything
                # else is written inline as a closure
                comp = comp_idents[ident]
                if comp.is_block:
                    if not comp.is_anonymous:
                        self.write_def_decl(comp, identifiers)
                    else:
                        self.write_inline_def(comp, identifiers, nested=True)
                else:
                    if comp.is_root():
                        self.write_def_decl(comp, identifiers)
                    else:
                        self.write_inline_def(comp, identifiers, nested=True)

            elif ident in self.compiler.namespaces:
                self.printer.writeline(
                    "%s = _mako_get_namespace(context, %r)" %
                    (ident, ident)
                )
            else:
                # plain variable: look in imported namespace names first
                # (if any), then in the context; strict mode raises
                # NameError instead of yielding UNDEFINED
                if getattr(self.compiler, 'has_ns_imports', False):
                    if self.compiler.strict_undefined:
                        self.printer.writelines(
                            "%s = _import_ns.get(%r, UNDEFINED)" %
                            (ident, ident),
                            "if %s is UNDEFINED:" % ident,
                            "try:",
                            "%s = context[%r]" % (ident, ident),
                            "except KeyError:",
                            "raise NameError(\"'%s' is not defined\")" %
                            ident,
                            None, None
                        )
                    else:
                        self.printer.writeline(
                            "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" %
                            (ident, ident, ident))
                else:
                    if self.compiler.strict_undefined:
                        self.printer.writelines(
                            "try:",
                            "%s = context[%r]" % (ident, ident),
                            "except KeyError:",
                            "raise NameError(\"'%s' is not defined\")" %
                            ident,
                            None
                        )
                    else:
                        self.printer.writeline(
                            "%s = context.get(%r, UNDEFINED)" % (ident, ident)
                        )

        self.printer.writeline("__M_writer = context.writer()")
+
+ def write_def_decl(self, node, identifiers):
+ """write a locally-available callable referencing a top-level def"""
+ funcname = node.funcname
+ namedecls = node.get_argument_expressions()
+ nameargs = node.get_argument_expressions(as_call=True)
+
+ if not self.in_def and (
+ len(self.identifiers.locally_assigned) > 0 or
+ len(self.identifiers.argument_declared) > 0):
+ nameargs.insert(0, 'context._locals(__M_locals)')
+ else:
+ nameargs.insert(0, 'context')
+ self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls)))
+ self.printer.writeline(
+ "return render_%s(%s)" % (funcname, ",".join(nameargs)))
+ self.printer.writeline(None)
+
    def write_inline_def(self, node, identifiers, nested):
        """write a locally-available def callable inside an enclosing def.

        :param node: the def/block node to render as a local function.
        :param identifiers: identifier scope of the enclosing function; a
         child scope is branched from it for the def body.
        :param nested: forwarded to ``identifiers.branch()``.
        """

        namedecls = node.get_argument_expressions()

        decorator = node.decorator
        if decorator:
            self.printer.writeline(
                "@runtime._decorate_inline(context, %s)" % decorator)
        self.printer.writeline(
            "def %s(%s):" % (node.funcname, ",".join(namedecls)))
        filtered = len(node.filter_args.args) > 0
        # NOTE(review): attribute text is passed to eval(); only safe while
        # template source is trusted.
        buffered = eval(node.attributes.get('buffered', 'False'))
        cached = eval(node.attributes.get('cached', 'False'))
        self.printer.writelines(
            # push new frame, assign current frame to __M_caller
            "__M_caller = context.caller_stack._push_frame()",
            "try:"
        )
        if buffered or filtered or cached:
            self.printer.writelines(
                "context._push_buffer()",
            )

        # from here on ``identifiers`` is the scope of the def body
        identifiers = identifiers.branch(node, nested=nested)

        self.write_variable_declares(identifiers)

        # the body scope is only on the stack while the children are visited
        self.identifier_stack.append(identifiers)
        for n in node.nodes:
            n.accept_visitor(self)
        self.identifier_stack.pop()

        self.write_def_finish(node, buffered, filtered, cached)
        self.printer.writeline(None)
        if cached:
            self.write_cache_decorator(node, node.funcname,
                                       namedecls, False, identifiers,
                                       inline=True, toplevel=False)
+
    def write_def_finish(self, node, buffered, filtered, cached,
                         callstack=True):
        """write the end section of a rendering function, either outermost or
        inline.

        this takes into account if the rendering function was filtered,
        buffered, etc. and closes the corresponding try: block if any, and
        writes code to retrieve captured content, apply filters, send proper
        return value.

        :param node: def node whose ``filter_args`` are applied when
         ``filtered`` is true.
        :param callstack: when true, the generated ``finally:`` section also
         pops the caller-stack frame.
        """

        # plain case: nothing was captured, so just return '' and
        # (optionally) pop the frame in a finally: block
        if not buffered and not cached and not filtered:
            self.printer.writeline("return ''")
            if callstack:
                self.printer.writelines(
                    "finally:",
                    "context.caller_stack._pop_frame()",
                    None
                )

        if buffered or filtered or cached:
            if buffered or cached:
                # in a caching scenario, don't try to get a writer
                # from the context after popping; assume the caching
                # implementation might be using a context with no
                # extra buffers
                self.printer.writelines(
                    "finally:",
                    "__M_buf = context._pop_buffer()"
                )
            else:
                self.printer.writelines(
                    "finally:",
                    "__M_buf, __M_writer = context._pop_buffer_and_writer()"
                )

            if callstack:
                self.printer.writeline("context.caller_stack._pop_frame()")

            # ``s`` is the expression for the captured text, wrapped in
            # filter calls as required
            s = "__M_buf.getvalue()"
            if filtered:
                s = self.create_filter_callable(node.filter_args.args, s,
                                                False)
            self.printer.writeline(None)
            if buffered and not cached:
                s = self.create_filter_callable(self.compiler.buffer_filters,
                                                s, False)
            # buffered/cached functions return the text; filtered-only
            # functions write it to the restored writer and return ''
            if buffered or cached:
                self.printer.writeline("return %s" % s)
            else:
                self.printer.writelines(
                    "__M_writer(%s)" % s,
                    "return ''"
                )
+
    def write_cache_decorator(self, node_or_pagetag, name,
                              args, buffered, identifiers,
                              inline=False, toplevel=False):
        """write a post-function decorator to replace a rendering
        callable with a cached version of itself.

        The original function is first aliased to ``__M_<name>``; a new
        function ``<name>`` with the same signature is then written which
        calls ``cache._ctx_get_or_create`` with a lambda invoking the alias.

        :param node_or_pagetag: node providing ``cache_*`` attributes.
        :param name: name of the render function being wrapped.
        :param args: its argument expressions.
        :param buffered: if true the wrapper returns the cached text,
         otherwise it writes the text and returns ``''``.
        :param identifiers: scope used to declare variables in the wrapper.
        :param inline: not used in this method.
        :param toplevel: forwarded to ``write_variable_declares``.
        """

        self.printer.writeline("__M_%s = %s" % (name, name))
        cachekey = node_or_pagetag.parsed_attributes.get('cache_key',
                                                         repr(name))

        # collect cache_* attributes (minus the ``cache_`` prefix); page-level
        # values are applied first so the node's own values override them
        cache_args = {}
        if self.compiler.pagetag is not None:
            cache_args.update(
                (
                    pa[6:],
                    self.compiler.pagetag.parsed_attributes[pa]
                )
                for pa in self.compiler.pagetag.parsed_attributes
                if pa.startswith('cache_') and pa != 'cache_key'
            )
        cache_args.update(
            (
                pa[6:],
                node_or_pagetag.parsed_attributes[pa]
            ) for pa in node_or_pagetag.parsed_attributes
            if pa.startswith('cache_') and pa != 'cache_key'
        )
        # NOTE(review): attribute text is passed to eval(); only safe while
        # template source is trusted.
        if 'timeout' in cache_args:
            cache_args['timeout'] = int(eval(cache_args['timeout']))

        self.printer.writeline("def %s(%s):" % (name, ','.join(args)))

        # form "arg1, arg2, arg3=arg3, arg4=arg4", etc.
        pass_args = [
            "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a
            for a in args
        ]

        self.write_variable_declares(
            identifiers,
            toplevel=toplevel,
            limit=node_or_pagetag.undeclared_identifiers()
        )
        if buffered:
            s = "context.get('local')."\
                "cache._ctx_get_or_create("\
                "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % (
                    cachekey, name, ','.join(pass_args),
                    ''.join(["%s=%s, " % (k, v)
                             for k, v in cache_args.items()]),
                    name
                )
            # apply buffer_filters
            s = self.create_filter_callable(self.compiler.buffer_filters, s,
                                            False)
            self.printer.writelines("return " + s, None)
        else:
            self.printer.writelines(
                "__M_writer(context.get('local')."
                "cache._ctx_get_or_create("
                "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" %
                (
                    cachekey, name, ','.join(pass_args),
                    ''.join(["%s=%s, " % (k, v)
                             for k, v in cache_args.items()]),
                    name,
                ),
                "return ''",
                None
            )
+
+ def create_filter_callable(self, args, target, is_expression):
+ """write a filter-applying expression based on the filters
+ present in the given filter names, adjusting for the global
+ 'default' filter aliases as needed."""
+
+ def locate_encode(name):
+ if re.match(r'decode\..+', name):
+ return "filters." + name
+ elif self.compiler.disable_unicode:
+ return filters.NON_UNICODE_ESCAPES.get(name, name)
+ else:
+ return filters.DEFAULT_ESCAPES.get(name, name)
+
+ if 'n' not in args:
+ if is_expression:
+ if self.compiler.pagetag:
+ args = self.compiler.pagetag.filter_args.args + args
+ if self.compiler.default_filters:
+ args = self.compiler.default_filters + args
+ for e in args:
+ # if filter given as a function, get just the identifier portion
+ if e == 'n':
+ continue
+ m = re.match(r'(.+?)(\(.*\))', e)
+ if m:
+ ident, fargs = m.group(1, 2)
+ f = locate_encode(ident)
+ e = f + fargs
+ else:
+ e = locate_encode(e)
+ assert e is not None
+ target = "%s(%s)" % (e, target)
+ return target
+
+ def visitExpression(self, node):
+ self.printer.start_source(node.lineno)
+ if len(node.escapes) or \
+ (
+ self.compiler.pagetag is not None and
+ len(self.compiler.pagetag.filter_args.args)
+ ) or \
+ len(self.compiler.default_filters):
+
+ s = self.create_filter_callable(node.escapes_code.args,
+ "%s" % node.text, True)
+ self.printer.writeline("__M_writer(%s)" % s)
+ else:
+ self.printer.writeline("__M_writer(%s)" % node.text)
+
    def visitControlLine(self, node):
        """Emit the Python for a ``%`` control line.

        An end line closes the current block and, if the node carries a loop
        context, also writes the ``finally:`` section that exits the loop
        stack.  Any other control line is written as is (``for`` lines go
        through ``mangle_mako_loop`` when loop support is enabled), followed
        by ``pass`` when the block would otherwise be empty.
        """
        if node.isend:
            self.printer.writeline(None)
            if node.has_loop_context:
                self.printer.writeline('finally:')
                self.printer.writeline("loop = __M_loop._exit()")
                self.printer.writeline(None)
        else:
            self.printer.start_source(node.lineno)
            if self.compiler.enable_loop and node.keyword == 'for':
                text = mangle_mako_loop(node, self.printer)
            else:
                text = node.text
            self.printer.writeline(text)
            children = node.get_children()
            # this covers the three situations where we want to insert a pass:
            # 1) a ternary control line with no children,
            # 2) a primary control line with nothing but its own ternary
            #    and end control lines, and
            # 3) any control line with no content other than comments
            if not children or (
                    compat.all(isinstance(c, (parsetree.Comment,
                                              parsetree.ControlLine))
                               for c in children) and
                    compat.all((node.is_ternary(c.keyword) or c.isend)
                               for c in children
                               if isinstance(c, parsetree.ControlLine))):
                self.printer.writeline("pass")
+
+ def visitText(self, node):
+ self.printer.start_source(node.lineno)
+ self.printer.writeline("__M_writer(%s)" % repr(node.content))
+
+ def visitTextTag(self, node):
+ filtered = len(node.filter_args.args) > 0
+ if filtered:
+ self.printer.writelines(
+ "__M_writer = context._push_writer()",
+ "try:",
+ )
+ for n in node.nodes:
+ n.accept_visitor(self)
+ if filtered:
+ self.printer.writelines(
+ "finally:",
+ "__M_buf, __M_writer = context._pop_buffer_and_writer()",
+ "__M_writer(%s)" %
+ self.create_filter_callable(
+ node.filter_args.args,
+ "__M_buf.getvalue()",
+ False),
+ None
+ )
+
+ def visitCode(self, node):
+ if not node.ismodule:
+ self.printer.start_source(node.lineno)
+ self.printer.write_indented_block(node.text)
+
+ if not self.in_def and len(self.identifiers.locally_assigned) > 0:
+ # if we are the "template" def, fudge locally
+ # declared/modified variables into the "__M_locals" dictionary,
+ # which is used for def calls within the same template,
+ # to simulate "enclosing scope"
+ self.printer.writeline(
+ '__M_locals_builtin_stored = __M_locals_builtin()')
+ self.printer.writeline(
+ '__M_locals.update(__M_dict_builtin([(__M_key,'
+ ' __M_locals_builtin_stored[__M_key]) for __M_key in'
+ ' [%s] if __M_key in __M_locals_builtin_stored]))' %
+ ','.join([repr(x) for x in node.declared_identifiers()]))
+
+ def visitIncludeTag(self, node):
+ self.printer.start_source(node.lineno)
+ args = node.attributes.get('args')
+ if args:
+ self.printer.writeline(
+ "runtime._include_file(context, %s, _template_uri, %s)" %
+ (node.parsed_attributes['file'], args))
+ else:
+ self.printer.writeline(
+ "runtime._include_file(context, %s, _template_uri)" %
+ (node.parsed_attributes['file']))
+
    def visitNamespaceTag(self, node):
        # nothing is emitted in place for a namespace tag; namespaces are
        # collected by write_toplevel() and written by write_namespaces()
        pass

    def visitDefTag(self, node):
        # nothing is emitted in place for a def tag; defs are written by
        # write_variable_declares() via write_def_decl()/write_inline_def()
        pass
+
+ def visitBlockTag(self, node):
+ if node.is_anonymous:
+ self.printer.writeline("%s()" % node.funcname)
+ else:
+ nameargs = node.get_argument_expressions(as_call=True)
+ nameargs += ['**pageargs']
+ self.printer.writeline("if 'parent' not in context._data or "
+ "not hasattr(context._data['parent'], '%s'):"
+ % node.funcname)
+ self.printer.writeline(
+ "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs)))
+ self.printer.writeline("\n")
+
+ def visitCallNamespaceTag(self, node):
+ # TODO: we can put namespace-specific checks here, such
+ # as ensure the given namespace will be imported,
+ # pre-import the namespace, etc.
+ self.visitCallTag(node)
+
    def visitCallTag(self, node):
        """Emit the code for a ``<%call>`` tag.

        A local ``ccall(caller)`` function is generated that defines the
        defs found inside the tag plus a ``body()`` function for the tag's
        content, and returns them as a list.  The call expression itself is
        then written, wrapped so that ``context.caller_stack.nextcaller`` is
        set to a ``caller`` namespace beforehand and reset to ``None`` in a
        ``finally:`` block.
        """
        self.printer.writeline("def ccall(caller):")
        export = ['body']
        callable_identifiers = self.identifiers.branch(node, nested=True)
        body_identifiers = callable_identifiers.branch(node, nested=False)
        # we want the 'caller' passed to ccall to be used
        # for the body() function, but for other non-body()
        # <%def>s within <%call> we want the current caller
        # off the call stack (if any)
        body_identifiers.add_declared('caller')

        self.identifier_stack.append(body_identifiers)
        # first pass: write the defs/blocks nested in the call tag
        # (``s`` is the visitor, ``self`` the generator)
        class DefVisitor(object):
            def visitDefTag(s, node):
                s.visitDefOrBase(node)

            def visitBlockTag(s, node):
                s.visitDefOrBase(node)

            def visitDefOrBase(s, node):
                self.write_inline_def(node, callable_identifiers, nested=False)
                if not node.is_anonymous:
                    export.append(node.funcname)
                # remove defs that are within the <%call> from the
                # "closuredefs" defined in the body, so they don't render twice
                if node.funcname in body_identifiers.closuredefs:
                    del body_identifiers.closuredefs[node.funcname]

        vis = DefVisitor()
        for n in node.nodes:
            n.accept_visitor(vis)
        self.identifier_stack.pop()

        bodyargs = node.body_decl.get_argument_expressions()
        self.printer.writeline("def body(%s):" % ','.join(bodyargs))

        # TODO: figure out best way to specify
        # buffering/nonbuffering (at call time would be better)
        # (``buffered`` is hard-wired to False, so the branch below never runs)
        buffered = False
        if buffered:
            self.printer.writelines(
                "context._push_buffer()",
                "try:"
            )
        self.write_variable_declares(body_identifiers)
        self.identifier_stack.append(body_identifiers)

        # second pass: render the tag's content as the body() function
        for n in node.nodes:
            n.accept_visitor(self)
        self.identifier_stack.pop()

        self.write_def_finish(node, buffered, False, False, callstack=False)
        self.printer.writelines(
            None,
            "return [%s]" % (','.join(export)),
            None
        )

        self.printer.writelines(
            # push on caller for nested call
            "context.caller_stack.nextcaller = "
            "runtime.Namespace('caller', context, "
            "callables=ccall(__M_caller))",
            "try:")
        self.printer.start_source(node.lineno)
        self.printer.writelines(
            "__M_writer(%s)" % self.create_filter_callable(
                [], node.expression, True),
            "finally:",
            "context.caller_stack.nextcaller = None",
            None
        )
+
+class _Identifiers(object):
+ """tracks the status of identifier names as template code is rendered."""
+
+ def __init__(self, compiler, node=None, parent=None, nested=False):
+ if parent is not None:
+ # if we are the branch created in write_namespaces(),
+ # we don't share any context from the main body().
+ if isinstance(node, parsetree.NamespaceTag):
+ self.declared = set()
+ self.topleveldefs = util.SetLikeDict()
+ else:
+ # things that have already been declared
+ # in an enclosing namespace (i.e. names we can just use)
+ self.declared = set(parent.declared).\
+ union([c.name for c in parent.closuredefs.values()]).\
+ union(parent.locally_declared).\
+ union(parent.argument_declared)
+
+ # if these identifiers correspond to a "nested"
+ # scope, it means whatever the parent identifiers
+ # had as undeclared will have been declared by that parent,
+ # and therefore we have them in our scope.
+ if nested:
+ self.declared = self.declared.union(parent.undeclared)
+
+ # top level defs that are available
+ self.topleveldefs = util.SetLikeDict(**parent.topleveldefs)
+ else:
+ self.declared = set()
+ self.topleveldefs = util.SetLikeDict()
+
+ self.compiler = compiler
+
+ # things within this level that are referenced before they
+ # are declared (e.g. assigned to)
+ self.undeclared = set()
+
+ # things that are declared locally. some of these things
+ # could be in the "undeclared" list as well if they are
+ # referenced before declared
+ self.locally_declared = set()
+
+ # assignments made in explicit python blocks.
+ # these will be propagated to
+ # the context of local def calls.
+ self.locally_assigned = set()
+
+ # things that are declared in the argument
+ # signature of the def callable
+ self.argument_declared = set()
+
+ # closure defs that are defined in this level
+ self.closuredefs = util.SetLikeDict()
+
+ self.node = node
+
+ if node is not None:
+ node.accept_visitor(self)
+
+ illegal_names = self.compiler.reserved_names.intersection(
+ self.locally_declared)
+ if illegal_names:
+ raise exceptions.NameConflictError(
+ "Reserved words declared in template: %s" %
+ ", ".join(illegal_names))
+
+
+ def branch(self, node, **kwargs):
+ """create a new Identifiers for a new Node, with
+ this Identifiers as the parent."""
+
+ return _Identifiers(self.compiler, node, self, **kwargs)
+
+ @property
+ def defs(self):
+ return set(self.topleveldefs.union(self.closuredefs).values())
+
+ def __repr__(self):
+ return "Identifiers(declared=%r, locally_declared=%r, "\
+ "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\
+ "argumentdeclared=%r)" %\
+ (
+ list(self.declared),
+ list(self.locally_declared),
+ list(self.undeclared),
+ [c.name for c in self.topleveldefs.values()],
+ [c.name for c in self.closuredefs.values()],
+ self.argument_declared)
+
+ def check_declared(self, node):
+ """update the state of this Identifiers with the undeclared
+ and declared identifiers of the given node."""
+
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and\
+ ident not in self.declared.union(self.locally_declared):
+ self.undeclared.add(ident)
+ for ident in node.declared_identifiers():
+ self.locally_declared.add(ident)
+
+ def add_declared(self, ident):
+ self.declared.add(ident)
+ if ident in self.undeclared:
+ self.undeclared.remove(ident)
+
+ def visitExpression(self, node):
+ self.check_declared(node)
+
+ def visitControlLine(self, node):
+ self.check_declared(node)
+
+ def visitCode(self, node):
+ if not node.ismodule:
+ self.check_declared(node)
+ self.locally_assigned = self.locally_assigned.union(
+ node.declared_identifiers())
+
+ def visitNamespaceTag(self, node):
+ # only traverse into the sub-elements of a
+ # <%namespace> tag if we are the branch created in
+ # write_namespaces()
+ if self.node is node:
+ for n in node.nodes:
+ n.accept_visitor(self)
+
+ def _check_name_exists(self, collection, node):
+ existing = collection.get(node.funcname)
+ collection[node.funcname] = node
+ if existing is not None and \
+ existing is not node and \
+ (node.is_block or existing.is_block):
+ raise exceptions.CompileException(
+ "%%def or %%block named '%s' already "
+ "exists in this template." %
+ node.funcname, **node.exception_kwargs)
+
+ def visitDefTag(self, node):
+ if node.is_root() and not node.is_anonymous:
+ self._check_name_exists(self.topleveldefs, node)
+ elif node is not self.node:
+ self._check_name_exists(self.closuredefs, node)
+
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and \
+ ident not in self.declared.union(self.locally_declared):
+ self.undeclared.add(ident)
+
+ # visit defs only one level deep
+ if node is self.node:
+ for ident in node.declared_identifiers():
+ self.argument_declared.add(ident)
+
+ for n in node.nodes:
+ n.accept_visitor(self)
+
+ def visitBlockTag(self, node):
+ if node is not self.node and not node.is_anonymous:
+
+ if isinstance(self.node, parsetree.DefTag):
+ raise exceptions.CompileException(
+ "Named block '%s' not allowed inside of def '%s'"
+ % (node.name, self.node.name), **node.exception_kwargs)
+ elif isinstance(self.node,
+ (parsetree.CallTag, parsetree.CallNamespaceTag)):
+ raise exceptions.CompileException(
+ "Named block '%s' not allowed inside of <%%call> tag"
+ % (node.name, ), **node.exception_kwargs)
+
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and \
+ ident not in self.declared.union(self.locally_declared):
+ self.undeclared.add(ident)
+
+ if not node.is_anonymous:
+ self._check_name_exists(self.topleveldefs, node)
+ self.undeclared.add(node.funcname)
+ elif node is not self.node:
+ self._check_name_exists(self.closuredefs, node)
+ for ident in node.declared_identifiers():
+ self.argument_declared.add(ident)
+ for n in node.nodes:
+ n.accept_visitor(self)
+
+ def visitTextTag(self, node):
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and \
+ ident not in self.declared.union(self.locally_declared):
+ self.undeclared.add(ident)
+
+ def visitIncludeTag(self, node):
+ self.check_declared(node)
+
+ def visitPageTag(self, node):
+ for ident in node.declared_identifiers():
+ self.argument_declared.add(ident)
+ self.check_declared(node)
+
+ def visitCallNamespaceTag(self, node):
+ self.visitCallTag(node)
+
+ def visitCallTag(self, node):
+ if node is self.node:
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and \
+ ident not in self.declared.union(
+ self.locally_declared):
+ self.undeclared.add(ident)
+ for ident in node.declared_identifiers():
+ self.argument_declared.add(ident)
+ for n in node.nodes:
+ n.accept_visitor(self)
+ else:
+ for ident in node.undeclared_identifiers():
+ if ident != 'context' and \
+ ident not in self.declared.union(
+ self.locally_declared):
+ self.undeclared.add(ident)
+
+
+_FOR_LOOP = re.compile(
+ r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*'
+ r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):'
+)
+
def mangle_mako_loop(node, printer):
    """rewrite a ``for`` control line so that it iterates a loop context.

    The node is scanned with a ``LoopVariable`` visitor.  If no reference to
    ``loop`` is detected, the node's text is returned untouched.  Otherwise
    the last child node is flagged with ``has_loop_context``, the lines that
    enter the loop context are written to ``printer``, and the replacement
    ``for ... in loop:`` header is returned.

    Raises SyntaxError when ``loop`` is referenced but the text does not
    match the ``_FOR_LOOP`` pattern.
    """
    detector = LoopVariable()
    node.accept_visitor(detector)
    if not detector.detected:
        return node.text

    node.nodes[-1].has_loop_context = True
    parsed = _FOR_LOOP.match(node.text)
    if not parsed:
        raise SyntaxError("Couldn't apply loop context: %s" % node.text)

    targets = parsed.group(1)
    iterable = parsed.group(2)
    printer.writelines(
        'loop = __M_loop._enter(%s)' % iterable,
        'try:'
    )
    return 'for %s in loop:' % targets
+
+
class LoopVariable(object):
    """Node visitor that sets ``detected`` once the name 'loop' shows up
    among a visited node's undeclared identifiers."""

    def __init__(self):
        self.detected = False

    def _loop_reference_detected(self, node):
        # a hit on this node ends the search below it; otherwise descend
        if 'loop' in node.undeclared_identifiers():
            self.detected = True
            return
        for child in node.get_children():
            child.accept_visitor(self)

    def visitControlLine(self, node):
        self._loop_reference_detected(node)

    def visitCode(self, node):
        self._loop_reference_detected(node)

    def visitExpression(self, node):
        self._loop_reference_detected(node)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py
new file mode 100644
index 00000000000..fe277bbf05a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py
@@ -0,0 +1,174 @@
+import sys
+import time
+
+py3k = sys.version_info >= (3, 0)
+py33 = sys.version_info >= (3, 3)
+py2k = sys.version_info < (3,)
+py26 = sys.version_info >= (2, 6)
+jython = sys.platform.startswith('java')
+win32 = sys.platform.startswith('win')
+pypy = hasattr(sys, 'pypy_version_info')
+
+if py3k:
+ from io import StringIO
+ import builtins as compat_builtins
+ from urllib.parse import quote_plus, unquote_plus
+ from html.entities import codepoint2name, name2codepoint
+ string_types = str,
+ binary_type = bytes
+ text_type = str
+
+ from io import BytesIO as byte_buffer
+
+ def u(s):
+ return s
+
+ def b(s):
+ return s.encode("latin-1")
+
+ def octal(lit):
+ return eval("0o" + lit)
+
+else:
+ import __builtin__ as compat_builtins
+ try:
+ from cStringIO import StringIO
+ except:
+ from StringIO import StringIO
+
+ byte_buffer = StringIO
+
+ from urllib import quote_plus, unquote_plus
+ from htmlentitydefs import codepoint2name, name2codepoint
+ string_types = basestring,
+ binary_type = str
+ text_type = unicode
+
+ def u(s):
+ return unicode(s, "utf-8")
+
+ def b(s):
+ return s
+
+ def octal(lit):
+ return eval("0" + lit)
+
+
+if py33:
+ from importlib import machinery
+ def load_module(module_id, path):
+ return machinery.SourceFileLoader(module_id, path).load_module()
+else:
+ import imp
+ def load_module(module_id, path):
+ fp = open(path, 'rb')
+ try:
+ return imp.load_source(module_id, path, fp)
+ finally:
+ fp.close()
+
+
+if py3k:
+ def reraise(tp, value, tb=None, cause=None):
+ if cause is not None:
+ value.__cause__ = cause
+ if value.__traceback__ is not tb:
+ raise value.with_traceback(tb)
+ raise value
+else:
+ exec("def reraise(tp, value, tb=None, cause=None):\n"
+ " raise tp, value, tb\n")
+
+
+def exception_as():
+ return sys.exc_info()[1]
+
+try:
+ import threading
+ if py3k:
+ import _thread as thread
+ else:
+ import thread
+except ImportError:
+ import dummy_threading as threading
+ if py3k:
+ import _dummy_thread as thread
+ else:
+ import dummy_thread as thread
+
# Timer used by the rest of the package.  time.clock() was removed in
# Python 3.8, so referencing it unconditionally made this module fail to
# import on Windows/Jython there; fall back to time.perf_counter() when
# time.clock is gone.
if (win32 or jython) and hasattr(time, 'clock'):
    time_func = time.clock
elif win32 or jython:
    time_func = time.perf_counter
else:
    time_func = time.time
+
try:
    from functools import partial
except ImportError:
    # Only a missing functools should trigger the fallback; a bare
    # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
    def partial(func, *args, **keywords):
        """Minimal stand-in for ``functools.partial``.

        Returns a callable that invokes ``func`` with ``args`` placed before
        the call-time positional arguments and ``keywords`` overridden by
        the call-time keyword arguments.
        """
        def newfunc(*fargs, **fkeywords):
            newkeywords = keywords.copy()
            newkeywords.update(fkeywords)
            return func(*(args + fargs), **newkeywords)
        return newfunc
+
+
+all = all
+import json
+
+def exception_name(exc):
+ return exc.__class__.__name__
+
try:
    from inspect import CO_VARKEYWORDS, CO_VARARGS

    def inspect_func_args(fn):
        """Return ``(args, varargs, varkw, defaults)`` for function ``fn``.

        ``args`` is the list of positional parameter names, ``varargs`` and
        ``varkw`` are the names of the ``*`` and ``**`` parameters (or None),
        and ``defaults`` is the function's positional defaults tuple.
        """
        # __code__ / __defaults__ where available, the func_* spellings
        # otherwise
        co = fn.__code__ if hasattr(fn, '__code__') else fn.func_code

        nargs = co.co_argcount
        names = co.co_varnames
        args = list(names[:nargs])

        # co_varnames lists keyword-only parameters between the positional
        # ones and *args/**kw; skip them so the indexes below land on the
        # right names.
        nargs += getattr(co, 'co_kwonlyargcount', 0)

        varargs = None
        if co.co_flags & CO_VARARGS:
            varargs = names[nargs]
            nargs = nargs + 1
        varkw = None
        if co.co_flags & CO_VARKEYWORDS:
            varkw = names[nargs]

        if hasattr(fn, '__defaults__'):
            defaults = fn.__defaults__
        else:
            defaults = fn.func_defaults
        return args, varargs, varkw, defaults
except ImportError:
    import inspect

    def inspect_func_args(fn):
        """Fallback used when ``inspect`` lacks the ``CO_*`` flags."""
        return inspect.getargspec(fn)
+
+if py3k:
+ def callable(fn):
+ return hasattr(fn, '__call__')
+else:
+ callable = callable
+
+
+################################################
+# cross-compatible metaclass implementation
+# Copyright (c) 2010-2012 Benjamin Peterson
def with_metaclass(meta, base=object):
    """Return a new class named ``<meta name>Base`` that derives from
    ``base`` and is created by calling the metaclass ``meta``."""
    class_name = "%sBase" % meta.__name__
    bases = (base,)
    return meta(class_name, bases, {})
+################################################
+
+
def arg_stringname(func_arg):
    """Return the name of a kwarg or vararg as a string.

    Objects that expose an ``arg`` attribute (as ``_ast.arg`` does in
    Python 3.4) yield that attribute; anything else is passed through
    ``str()``.
    """
    if not hasattr(func_arg, 'arg'):
        return str(func_arg)
    return func_arg.arg
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py
new file mode 100644
index 00000000000..c531f2118d0
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py
@@ -0,0 +1,373 @@
+# mako/exceptions.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""exception classes"""
+
+import traceback
+import sys
+from mako import util, compat
+
+class MakoException(Exception):
+ pass
+
+class RuntimeException(MakoException):
+ pass
+
+def _format_filepos(lineno, pos, filename):
+ if filename is None:
+ return " at line: %d char: %d" % (lineno, pos)
+ else:
+ return " in file '%s' at line: %d char: %d" % (filename, lineno, pos)
+
+
class CompileException(MakoException):
    """Error carrying the template source and the position it refers to.

    The exception message is ``message`` followed by the formatted
    file/line/char position.
    """

    def __init__(self, message, source, lineno, pos, filename):
        located = message + _format_filepos(lineno, pos, filename)
        MakoException.__init__(self, located)
        self.source = source
        self.filename = filename
        self.lineno = lineno
        self.pos = pos
+
class SyntaxException(MakoException):
    """Error carrying the template source and the position it refers to.

    The exception message is ``message`` followed by the formatted
    file/line/char position.
    """

    def __init__(self, message, source, lineno, pos, filename):
        located = message + _format_filepos(lineno, pos, filename)
        MakoException.__init__(self, located)
        self.source = source
        self.filename = filename
        self.lineno = lineno
        self.pos = pos
+
+class UnsupportedError(MakoException):
+ """raised when a retired feature is used."""
+
+class NameConflictError(MakoException):
+ """raised when a reserved word is used inappropriately"""
+
+class TemplateLookupException(MakoException):
+ pass
+
+class TopLevelLookupException(TemplateLookupException):
+ pass
+
class RichTraceback(object):
    """Pull the current exception from the ``sys`` traceback and extracts
    Mako-specific template information.

    See the usage examples in :ref:`handling_exceptions`.

    Attributes set by the constructor: ``error``, ``records`` (list of
    8-item tuples built by ``_init``), ``source``, ``lineno`` and
    ``message``.

    """
    def __init__(self, error=None, traceback=None):
        """:param error: exception to describe; defaults to the one
         reported by ``sys.exc_info()``.
        :param traceback: traceback object; defaults to the one reported by
         ``sys.exc_info()``.
        """
        self.source, self.lineno = "", 0

        if error is None or traceback is None:
            t, value, tback = sys.exc_info()

        if error is None:
            error = value or t

        if traceback is None:
            traceback = tback

        self.error = error
        self.records = self._init(traceback)

        # compile/syntax errors carry their own source and line number,
        # which take precedence over what _init() derived
        if isinstance(self.error, (CompileException, SyntaxException)):
            self.source = self.error.source
            self.lineno = self.error.lineno
            self._has_source = True

        self._init_message()

    @property
    def errorname(self):
        """Class name of the wrapped error."""
        return compat.exception_name(self.error)

    def _init_message(self):
        """Find a unicode representation of self.error"""
        try:
            self.message = compat.text_type(self.error)
        except UnicodeError:
            try:
                self.message = str(self.error)
            except UnicodeEncodeError:
                # Fallback to args as neither unicode nor
                # str(Exception(u'\xe6')) work in Python < 2.6
                self.message = self.error.args[0]
        if not isinstance(self.message, compat.text_type):
            self.message = compat.text_type(self.message, 'ascii', 'replace')

    def _get_reformatted_records(self, records):
        """Yield 4-tuples; records with a template line (index 6) are
        remapped to the template file name and line number."""
        for rec in records:
            if rec[6] is not None:
                yield (rec[4], rec[5], rec[2], rec[6])
            else:
                yield tuple(rec[0:4])

    @property
    def traceback(self):
        """Return a list of 4-tuple traceback records (i.e. normal python
        format) with template-corresponding lines remapped to the originating
        template.

        """
        return list(self._get_reformatted_records(self.records))

    @property
    def reverse_records(self):
        return reversed(self.records)

    @property
    def reverse_traceback(self):
        """Return the same data as traceback, except in reverse order.
        """

        return list(self._get_reformatted_records(self.reverse_records))

    def _init(self, trcback):
        """format a traceback from sys.exc_info() into 8-item tuples,
        containing the regular four traceback tuple items, plus the original
        template filename, the line number adjusted relative to the template
        source, the code line from that line number of the template, and
        the full template source."""

        import mako.template
        # module filename -> (line_map, template_lines,
        #                     template_filename, template_source)
        # The template name and source are cached with the line map so that
        # a cache hit cannot pick up values left over from a different
        # template module seen in between.
        mods = {}
        rawrecords = traceback.extract_tb(trcback)
        new_trcback = []
        for filename, lineno, function, line in rawrecords:
            if not line:
                line = ''
            try:
                (line_map, template_lines,
                 template_filename, template_source) = mods[filename]
            except KeyError:
                try:
                    info = mako.template._get_module_info(filename)
                    module_source = info.code
                    template_source = info.source
                    template_filename = info.template_filename or filename
                except KeyError:
                    # A normal .py file (not a Template)
                    if not compat.py3k:
                        try:
                            fp = open(filename, 'rb')
                            try:
                                encoding = util.parse_encoding(fp)
                            finally:
                                # close even if parse_encoding() raises
                                fp.close()
                        except IOError:
                            encoding = None
                        if encoding:
                            line = line.decode(encoding)
                        else:
                            line = line.decode('ascii', 'replace')
                    new_trcback.append((filename, lineno, function, line,
                                        None, None, None, None))
                    continue

                source_map = mako.template.ModuleInfo.\
                    get_module_source_metadata(
                        module_source, full_line_map=True)
                line_map = source_map['full_line_map']

                # plain split(): a comprehension over ``line`` would
                # overwrite the traceback's ``line`` on Python 2
                template_lines = template_source.split("\n")
                mods[filename] = (line_map, template_lines,
                                  template_filename, template_source)

            template_ln = line_map[lineno - 1]

            if template_ln <= len(template_lines):
                template_line = template_lines[template_ln - 1]
            else:
                template_line = None
            new_trcback.append((filename, lineno, function,
                                line, template_filename, template_ln,
                                template_line, template_source))
        if not self.source:
            # NOTE(review): the scan stops before index 0, so the outermost
            # frame is never considered here -- kept as-is, confirm intent.
            for l in range(len(new_trcback) - 1, 0, -1):
                if new_trcback[l][5]:
                    self.source = new_trcback[l][7]
                    self.lineno = new_trcback[l][5]
                    break
            else:
                if new_trcback:
                    try:
                        # A normal .py file (not a Template)
                        fp = open(new_trcback[-1][0], 'rb')
                        try:
                            encoding = util.parse_encoding(fp)
                            if compat.py3k and not encoding:
                                # without this the source would stay bytes
                                # on Python 3; utf-8 is that interpreter's
                                # default source encoding
                                encoding = 'utf-8'
                            fp.seek(0)
                            self.source = fp.read()
                        finally:
                            fp.close()
                        if encoding:
                            self.source = self.source.decode(encoding)
                    except IOError:
                        self.source = ''
                    self.lineno = new_trcback[-1][1]
        return new_trcback
+
+
+def text_error_template(lookup=None):
+ """Provides a template that renders a stack trace in a similar format to
+ the Python interpreter, substituting source template filenames, line
+ numbers and code for that of the originating source template, as
+ applicable.
+
+ The ``lookup`` argument is accepted but is not referenced anywhere in
+ this function. The returned template takes ``error`` and ``traceback``
+ page arguments, both defaulting to None, and passes them to
+ ``RichTraceback``.
+
+ """
+ import mako.template
+ return mako.template.Template(r"""
+<%page args="error=None, traceback=None"/>
+<%!
+ from mako.exceptions import RichTraceback
+%>\
+<%
+ tback = RichTraceback(error=error, traceback=traceback)
+%>\
+Traceback (most recent call last):
+% for (filename, lineno, function, line) in tback.traceback:
+ File "${filename}", line ${lineno}, in ${function or '?'}
+ ${line | trim}
+% endfor
+${tback.errorname}: ${tback.message}
+""")
+
+
+def _install_pygments():
+ global syntax_highlight, pygments_html_formatter
+ from mako.ext.pygmentplugin import syntax_highlight,\
+ pygments_html_formatter
+
+def _install_fallback():
+ global syntax_highlight, pygments_html_formatter
+ from mako.filters import html_escape
+ pygments_html_formatter = None
+ def syntax_highlight(filename='', language=None):
+ return html_escape
+
+def _install_highlighting():
+ try:
+ _install_pygments()
+ except ImportError:
+ _install_fallback()
+_install_highlighting()
+
+def html_error_template():
+ """Provides a template that renders a stack trace in an HTML format,
+ providing an excerpt of code as well as substituting source template
+ filenames, line numbers and code for that of the originating source
+ template, as applicable.
+
+ The template's default ``encoding_errors`` value is
+ ``'htmlentityreplace'``. The template has two options. With the
+ ``full`` option disabled, only a section of an HTML document is
+ returned. With the ``css`` option disabled, the default stylesheet
+ won't be included.
+
+ The template is created with ``output_encoding`` set to
+ ``sys.getdefaultencoding()``. It also accepts ``error`` and
+ ``traceback`` page arguments, both defaulting to None, which are
+ passed to ``RichTraceback``.
+
+ """
+ import mako.template
+ return mako.template.Template(r"""
+<%!
+ from mako.exceptions import RichTraceback, syntax_highlight,\
+ pygments_html_formatter
+%>
+<%page args="full=True, css=True, error=None, traceback=None"/>
+% if full:
+<html>
+<head>
+ <title>Mako Runtime Error</title>
+% endif
+% if css:
+ <style>
+ body { font-family:verdana; margin:10px 30px 10px 30px;}
+ .stacktrace { margin:5px 5px 5px 5px; }
+ .highlight { padding:0px 10px 0px 10px; background-color:#9F9FDF; }
+ .nonhighlight { padding:0px; background-color:#DFDFDF; }
+ .sample { padding:10px; margin:10px 10px 10px 10px;
+ font-family:monospace; }
+ .sampleline { padding:0px 10px 0px 10px; }
+ .sourceline { margin:5px 5px 10px 5px; font-family:monospace;}
+ .location { font-size:80%; }
+ .highlight { white-space:pre; }
+ .sampleline { white-space:pre; }
+
+ % if pygments_html_formatter:
+ ${pygments_html_formatter.get_style_defs()}
+ .linenos { min-width: 2.5em; text-align: right; }
+ pre { margin: 0; }
+ .syntax-highlighted { padding: 0 10px; }
+ .syntax-highlightedtable { border-spacing: 1px; }
+ .nonhighlight { border-top: 1px solid #DFDFDF;
+ border-bottom: 1px solid #DFDFDF; }
+ .stacktrace .nonhighlight { margin: 5px 15px 10px; }
+ .sourceline { margin: 0 0; font-family:monospace; }
+ .code { background-color: #F8F8F8; width: 100%; }
+ .error .code { background-color: #FFBDBD; }
+ .error .syntax-highlighted { background-color: #FFBDBD; }
+ % endif
+
+ </style>
+% endif
+% if full:
+</head>
+<body>
+% endif
+
+<h2>Error !</h2>
+<%
+ tback = RichTraceback(error=error, traceback=traceback)
+ src = tback.source
+ line = tback.lineno
+ if src:
+ lines = src.split('\n')
+ else:
+ lines = None
+%>
+<h3>${tback.errorname}: ${tback.message|h}</h3>
+
+% if lines:
+ <div class="sample">
+ <div class="nonhighlight">
+% for index in range(max(0, line-4),min(len(lines), line+5)):
+ <%
+ if pygments_html_formatter:
+ pygments_html_formatter.linenostart = index + 1
+ %>
+ % if index + 1 == line:
+ <%
+ if pygments_html_formatter:
+ old_cssclass = pygments_html_formatter.cssclass
+ pygments_html_formatter.cssclass = 'error ' + old_cssclass
+ %>
+ ${lines[index] | syntax_highlight(language='mako')}
+ <%
+ if pygments_html_formatter:
+ pygments_html_formatter.cssclass = old_cssclass
+ %>
+ % else:
+ ${lines[index] | syntax_highlight(language='mako')}
+ % endif
+% endfor
+ </div>
+ </div>
+% endif
+
+<div class="stacktrace">
+% for (filename, lineno, function, line) in tback.reverse_traceback:
+ <div class="location">${filename}, line ${lineno}:</div>
+ <div class="nonhighlight">
+ <%
+ if pygments_html_formatter:
+ pygments_html_formatter.linenostart = lineno
+ %>
+ <div class="sourceline">${line | syntax_highlight(filename)}</div>
+ </div>
+% endfor
+</div>
+
+% if full:
+</body>
+</html>
+% endif
+""", output_encoding=sys.getdefaultencoding(),
+ encoding_errors='htmlentityreplace')
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
new file mode 100644
index 00000000000..d79ce2388f6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
@@ -0,0 +1,201 @@
+# mako/filters.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+
+import re
+import codecs
+
+from mako.compat import quote_plus, unquote_plus, codepoint2name, \
+ name2codepoint
+
+from mako import compat
+
+xml_escapes = {
+ '&': '&amp;',
+ '>': '&gt;',
+ '<': '&lt;',
+ '"': '&#34;', # also &quot; in html-only
+ "'": '&#39;' # also &apos; in html-only
+}
+
+# XXX: &quot; is valid in HTML and XML
+# &apos; is not valid HTML, but is valid XML
+
def legacy_html_escape(s):
    """legacy HTML escape for non-unicode mode."""
    # order matters: '&' is replaced first so that the ampersands
    # introduced by the later replacements are not escaped again
    replacements = (
        ("&", "&amp;"),
        (">", "&gt;"),
        ("<", "&lt;"),
        ('"', "&#34;"),
        ("'", "&#39;"),
    )
    for raw, escaped in replacements:
        s = s.replace(raw, escaped)
    return s
+
+
+try:
+ import markupsafe
+ html_escape = markupsafe.escape
+except ImportError:
+ html_escape = legacy_html_escape
+
def xml_escape(string):
    """Replace each of ``& < " ' >`` in ``string`` with its entry from
    ``xml_escapes``."""
    def _lookup(match):
        return xml_escapes[match.group()]

    return re.sub(r'([&<"\'>])', _lookup, string)
+
def url_escape(string):
    """Encode ``string`` as UTF-8 and quote the resulting octets with
    ``quote_plus``."""
    octets = string.encode("utf8")
    return quote_plus(octets)
+
+def legacy_url_escape(string):
+ # convert into a list of octets
+ return quote_plus(string)
+
def url_unescape(string):
    """Reverse ``quote_plus`` quoting of ``string``.

    A non-ASCII result is decoded as UTF-8 when it offers ``decode``.  On
    Python 3 ``unquote_plus`` already returns ``str``, which has no
    ``decode`` method, so the text is returned as-is instead of raising
    AttributeError.
    """
    text = unquote_plus(string)
    if not is_ascii_str(text) and hasattr(text, 'decode'):
        text = text.decode("utf8")
    return text
+
+def trim(string):
+ return string.strip()
+
+
class Decode(object):
    """Attribute access yields a decoder function for the encoding named by
    the attribute, e.g. ``decode.utf8``."""

    def __getattr__(self, key):
        def decode(x):
            # text passes through untouched
            if isinstance(x, compat.text_type):
                return x
            # binary data is decoded with the requested encoding
            if isinstance(x, compat.binary_type):
                return compat.text_type(x, encoding=key)
            # anything else goes through str() and is tried again
            return decode(str(x))
        return decode
decode = Decode()
+
+
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')


def is_ascii_str(text):
    """Return False for anything that is not a ``str``; for a ``str``,
    return the result of matching it against the all-ASCII pattern (a
    match object, or None when a character is outside that range)."""
    if not isinstance(text, str):
        return False
    return _ASCII_re.match(text)
+
+################################################################
+
class XMLEntityEscaper(object):
    """Escapes text to character entity references and back, using the
    codepoint/name tables given to the constructor."""

    def __init__(self, codepoint2name, name2codepoint):
        """:param codepoint2name: mapping of code point -> entity name.
        :param name2codepoint: mapping of entity name -> code point.
        """
        self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
                                      for c, n in codepoint2name.items()])
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return compat.text_type(text).translate(self.codepoint2entity)

    def __escape(self, m):
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return '&#x%X;' % codepoint

    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(self.__escape, compat.text_type(text)
                                    ).encode('ascii')

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving CombiningChars and Extenders.)
    #
    # Hex digits are accepted in either case: escape() above emits
    # upper-case digits ('&#x%X;'), which a lower-case-only class could
    # not read back.
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    __characterrefs = re.compile(r'''& (?:
                                          \#(\d+)
                                          | \#x([\da-fA-F]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;''',
                                 re.X | re.UNICODE)

    def __unescape(self, m):
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xfffd)
            # U+FFFD = "REPLACEMENT CHARACTER"
        if codepoint < 128:
            return chr(codepoint)
        try:
            try:
                # Python 2: chr() rejects values above 255
                return unichr(codepoint)
            except NameError:
                return chr(codepoint)
        except (ValueError, OverflowError):
            # numeric reference outside the valid code point range
            return compat.text_type('\\ufffd').encode('ascii').decode(
                'unicode_escape')

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)
+
+
+_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)
+
+html_entities_escape = _html_entities_escaper.escape_entities
+html_entities_unescape = _html_entities_escaper.unescape
+
+
def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python `codecs`_ error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'

    Any exception other than UnicodeEncodeError is re-raised.
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start:ex.end]
        text = _html_entities_escaper.escape(bad_text)
        # escape() returns ASCII-encoded bytes; decode them explicitly,
        # because text_type(bytes) on Python 3 produces the repr
        # "b'...'" rather than the entity text.
        if isinstance(text, compat.binary_type):
            text = text.decode('ascii')
        return (compat.text_type(text), ex.end)
    raise ex

codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
+
+
+# TODO: options to make this dynamic per-compilation will be added in a later
+# release
+DEFAULT_ESCAPES = {
+ 'x': 'filters.xml_escape',
+ 'h': 'filters.html_escape',
+ 'u': 'filters.url_escape',
+ 'trim': 'filters.trim',
+ 'entity': 'filters.html_entities_escape',
+ 'unicode': 'unicode',
+ 'decode': 'decode',
+ 'str': 'str',
+ 'n': 'n'
+}
+
+if compat.py3k:
+ DEFAULT_ESCAPES.update({
+ 'unicode': 'str'
+ })
+
+NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy()
+NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape'
+NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape'
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py
new file mode 100644
index 00000000000..1dda398215d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py
@@ -0,0 +1,441 @@
+# mako/lexer.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""provides the Lexer class for parsing template strings into parse trees."""
+
+import re
+import codecs
+from mako import parsetree, exceptions, compat
+from mako.pygen import adjust_whitespace
+
+_regexp_cache = {}
+
+class Lexer(object):
+ def __init__(self, text, filename=None,
+ disable_unicode=False,
+ input_encoding=None, preprocessor=None):
+ self.text = text
+ self.filename = filename
+ self.template = parsetree.TemplateNode(self.filename)
+ self.matched_lineno = 1
+ self.matched_charpos = 0
+ self.lineno = 1
+ self.match_position = 0
+ self.tag = []
+ self.control_line = []
+ self.ternary_stack = []
+ self.disable_unicode = disable_unicode
+ self.encoding = input_encoding
+
+ if compat.py3k and disable_unicode:
+ raise exceptions.UnsupportedError(
+ "Mako for Python 3 does not "
+ "support disabling Unicode")
+
+ if preprocessor is None:
+ self.preprocessor = []
+ elif not hasattr(preprocessor, '__iter__'):
+ self.preprocessor = [preprocessor]
+ else:
+ self.preprocessor = preprocessor
+
+ @property
+ def exception_kwargs(self):
+ return {'source': self.text,
+ 'lineno': self.matched_lineno,
+ 'pos': self.matched_charpos,
+ 'filename': self.filename}
+
+ def match(self, regexp, flags=None):
+ """compile the given regexp, cache the reg, and call match_reg()."""
+
+ try:
+ reg = _regexp_cache[(regexp, flags)]
+ except KeyError:
+ if flags:
+ reg = re.compile(regexp, flags)
+ else:
+ reg = re.compile(regexp)
+ _regexp_cache[(regexp, flags)] = reg
+
+ return self.match_reg(reg)
+
+ def match_reg(self, reg):
+ """match the given regular expression object to the current text
+ position.
+
+ if a match occurs, update the current text and line position.
+
+ """
+
+ mp = self.match_position
+
+ match = reg.match(self.text, self.match_position)
+ if match:
+ (start, end) = match.span()
+ if end == start:
+ self.match_position = end + 1
+ else:
+ self.match_position = end
+ self.matched_lineno = self.lineno
+ lines = re.findall(r"\n", self.text[mp:self.match_position])
+ cp = mp - 1
+ while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'):
+ cp -= 1
+ self.matched_charpos = mp - cp
+ self.lineno += len(lines)
+ #print "MATCHED:", match.group(0), "LINE START:",
+ # self.matched_lineno, "LINE END:", self.lineno
+ #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \
+ # (match and "TRUE" or "FALSE")
+ return match
+
+ def parse_until_text(self, *text):
+ """Consume input until one of the given terminators is reached.
+
+ Each item of ``text`` is a regular expression fragment; the fragments
+ are joined with ``|``. Returns a ``(consumed, terminator)`` tuple:
+ the text between the starting position and the terminator, and the
+ terminator text that matched. Raises ``SyntaxException`` when none
+ of the patterns tried in the loop match.
+ """
+ startpos = self.match_position
+ text_re = r'|'.join(text)
+ brace_level = 0
+ while True:
+ # skip over a ``#`` comment running to the end of the line
+ match = self.match(r'#.*\n')
+ if match:
+ continue
+ # skip over a quoted string (single, double or triple quoted)
+ match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1',
+ re.S)
+ if match:
+ continue
+ match = self.match(r'(%s)' % text_re)
+ if match:
+ # a '}' seen while braces are still open closes one of them
+ # instead of terminating the scan
+ if match.group(1) == '}' and brace_level > 0:
+ brace_level -= 1
+ continue
+ return \
+ self.text[startpos:
+ self.match_position - len(match.group(1))],\
+ match.group(1)
+ # consume up to the next quote, '#' or terminator candidate,
+ # tracking the brace depth of what was consumed
+ match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
+ if match:
+ brace_level += match.group(1).count('{')
+ brace_level -= match.group(1).count('}')
+ continue
+ raise exceptions.SyntaxException(
+ "Expected: %s" %
+ ','.join(text),
+ **self.exception_kwargs)
+
+ def append_node(self, nodecls, *args, **kwargs):
+ """Construct ``nodecls(*args, **kwargs)`` and attach it to the tree.
+
+ ``source``, ``lineno`` and ``pos`` default to the lexer's current
+ match information; ``filename`` is always overwritten. The node is
+ appended to the innermost open tag, or to the template root when no
+ tag is open, and the tag / control-line / ternary stacks are updated
+ according to the node's type. Raises ``SyntaxException`` for a
+ control keyword that is not a legal ternary of the open control line.
+ """
+ kwargs.setdefault('source', self.text)
+ kwargs.setdefault('lineno', self.matched_lineno)
+ kwargs.setdefault('pos', self.matched_charpos)
+ kwargs['filename'] = self.filename
+ node = nodecls(*args, **kwargs)
+ if len(self.tag):
+ self.tag[-1].nodes.append(node)
+ else:
+ self.template.nodes.append(node)
+ # build a set of child nodes for the control line
+ # (used for loop variable detection)
+ # also build a set of child nodes on ternary control lines
+ # (used for determining if a pass needs to be auto-inserted)
+ if self.control_line:
+ control_frame = self.control_line[-1]
+ control_frame.nodes.append(node)
+ if not (isinstance(node, parsetree.ControlLine) and
+ control_frame.is_ternary(node.keyword)):
+ if self.ternary_stack and self.ternary_stack[-1]:
+ self.ternary_stack[-1][-1].nodes.append(node)
+ if isinstance(node, parsetree.Tag):
+ # a new tag becomes the innermost open tag
+ if len(self.tag):
+ node.parent = self.tag[-1]
+ self.tag.append(node)
+ elif isinstance(node, parsetree.ControlLine):
+ if node.isend:
+ # an end keyword closes the current control line and its
+ # ternary frame
+ self.control_line.pop()
+ self.ternary_stack.pop()
+ elif node.is_primary:
+ # a primary keyword opens a control line with a fresh, empty
+ # ternary frame
+ self.control_line.append(node)
+ self.ternary_stack.append([])
+ elif self.control_line and \
+ self.control_line[-1].is_ternary(node.keyword):
+ self.ternary_stack[-1].append(node)
+ elif self.control_line and \
+ not self.control_line[-1].is_ternary(node.keyword):
+ raise exceptions.SyntaxException(
+ "Keyword '%s' not a legal ternary for keyword '%s'" %
+ (node.keyword, self.control_line[-1].keyword),
+ **self.exception_kwargs)
+
+ _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
+
+ def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
+ """given string/unicode or bytes/string, determine encoding
+ from magic encoding comment, return body as unicode
+ or raw if decode_raw=False
+
+ :param text: template source, either text or an encoded byte string.
+ :param decode_raw: when true, an encoded ``text`` is decoded with the
+ detected encoding before being returned.
+ :param known_encoding: used when no magic comment is found; if it is
+ also unset, ``'ascii'`` is used.
+ :param filename: only used when building error reports.
+ :return: an ``(encoding, text)`` tuple.
+ :raises CompileException: on a UTF-8 BOM combined with a magic comment
+ naming something other than ``utf-8``, or when decoding fails.
+
+ """
+ # already-decoded text is returned untouched; only the encoding name
+ # is worked out
+ if isinstance(text, compat.text_type):
+ m = self._coding_re.match(text)
+ encoding = m and m.group(1) or known_encoding or 'ascii'
+ return encoding, text
+
+ if text.startswith(codecs.BOM_UTF8):
+ # strip the BOM; it fixes the encoding to utf-8
+ text = text[len(codecs.BOM_UTF8):]
+ parsed_encoding = 'utf-8'
+ m = self._coding_re.match(text.decode('utf-8', 'ignore'))
+ if m is not None and m.group(1) != 'utf-8':
+ raise exceptions.CompileException(
+ "Found utf-8 BOM in file, with conflicting "
+ "magic encoding comment of '%s'" % m.group(1),
+ text.decode('utf-8', 'ignore'),
+ 0, 0, filename)
+ else:
+ # lenient decode just to look for the magic comment
+ m = self._coding_re.match(text.decode('utf-8', 'ignore'))
+ if m:
+ parsed_encoding = m.group(1)
+ else:
+ parsed_encoding = known_encoding or 'ascii'
+
+ if decode_raw:
+ try:
+ text = text.decode(parsed_encoding)
+ except UnicodeDecodeError:
+ raise exceptions.CompileException(
+ "Unicode decode operation of encoding '%s' failed" %
+ parsed_encoding,
+ text.decode('utf-8', 'ignore'),
+ 0, 0, filename)
+
+ return parsed_encoding, text
+
+ def parse(self):
+ self.encoding, self.text = self.decode_raw_stream(self.text,
+ not self.disable_unicode,
+ self.encoding,
+ self.filename,)
+
+ for preproc in self.preprocessor:
+ self.text = preproc(self.text)
+
+ # push the match marker past the
+ # encoding comment.
+ self.match_reg(self._coding_re)
+
+ self.textlength = len(self.text)
+
+ while (True):
+ if self.match_position > self.textlength:
+ break
+
+ if self.match_end():
+ break
+ if self.match_expression():
+ continue
+ if self.match_control_line():
+ continue
+ if self.match_comment():
+ continue
+ if self.match_tag_start():
+ continue
+ if self.match_tag_end():
+ continue
+ if self.match_python_block():
+ continue
+ if self.match_text():
+ continue
+
+ if self.match_position > self.textlength:
+ break
+ raise exceptions.CompileException("assertion failed")
+
+ if len(self.tag):
+ raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
+ self.tag[-1].keyword,
+ **self.exception_kwargs)
+ if len(self.control_line):
+ raise exceptions.SyntaxException(
+ "Unterminated control keyword: '%s'" %
+ self.control_line[-1].keyword,
+ self.text,
+ self.control_line[-1].lineno,
+ self.control_line[-1].pos, self.filename)
+ return self.template
+
+ def match_tag_start(self):
+ match = self.match(r'''
+ \<% # opening tag
+
+ ([\w\.\:]+) # keyword
+
+ ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \
+ # sign, string expression
+
+ \s* # more whitespace
+
+ (/)?> # closing
+
+ ''',
+
+ re.I | re.S | re.X)
+
+ if match:
+ keyword, attr, isend = match.groups()
+ self.keyword = keyword
+ attributes = {}
+ if attr:
+ for att in re.findall(
+ r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
+ key, val1, val2 = att
+ text = val1 or val2
+ text = text.replace('\r\n', '\n')
+ attributes[key] = text
+ self.append_node(parsetree.Tag, keyword, attributes)
+ if isend:
+ self.tag.pop()
+ else:
+ if keyword == 'text':
+ match = self.match(r'(.*?)(?=\</%text>)', re.S)
+ if not match:
+ raise exceptions.SyntaxException(
+ "Unclosed tag: <%%%s>" %
+ self.tag[-1].keyword,
+ **self.exception_kwargs)
+ self.append_node(parsetree.Text, match.group(1))
+ return self.match_tag_end()
+ return True
+ else:
+ return False
+
+ def match_tag_end(self):
+ match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
+ if match:
+ if not len(self.tag):
+ raise exceptions.SyntaxException(
+ "Closing tag without opening tag: </%%%s>" %
+ match.group(1),
+ **self.exception_kwargs)
+ elif self.tag[-1].keyword != match.group(1):
+ raise exceptions.SyntaxException(
+ "Closing tag </%%%s> does not match tag: <%%%s>" %
+ (match.group(1), self.tag[-1].keyword),
+ **self.exception_kwargs)
+ self.tag.pop()
+ return True
+ else:
+ return False
+
+ def match_end(self):
+ match = self.match(r'\Z', re.S)
+ if match:
+ string = match.group()
+ if string:
+ return string
+ else:
+ return True
+ else:
+ return False
+
+ def match_text(self):
+ match = self.match(r"""
+ (.*?) # anything, followed by:
+ (
+ (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
+ # comment preceded by a
+ # consumed newline and whitespace
+ |
+ (?=\${) # an expression
+ |
+ (?=</?[%&]) # a substitution or block or call start or end
+ # - don't consume
+ |
+ (\\\r?\n) # an escaped newline - throw away
+ |
+ \Z # end of string
+ )""", re.X | re.S)
+
+ if match:
+ text = match.group(1)
+ if text:
+ self.append_node(parsetree.Text, text)
+ return True
+ else:
+ return False
+
+ def match_python_block(self):
+ match = self.match(r"<%(!)?")
+ if match:
+ line, pos = self.matched_lineno, self.matched_charpos
+ text, end = self.parse_until_text(r'%>')
+ # the trailing newline helps
+ # compiler.parse() not complain about indentation
+ text = adjust_whitespace(text) + "\n"
+ self.append_node(
+ parsetree.Code,
+ text,
+ match.group(1) == '!', lineno=line, pos=pos)
+ return True
+ else:
+ return False
+
+ def match_expression(self):
+ match = self.match(r"\${")
+ if match:
+ line, pos = self.matched_lineno, self.matched_charpos
+ text, end = self.parse_until_text(r'\|', r'}')
+ if end == '|':
+ escapes, end = self.parse_until_text(r'}')
+ else:
+ escapes = ""
+ text = text.replace('\r\n', '\n')
+ self.append_node(
+ parsetree.Expression,
+ text, escapes.strip(),
+ lineno=line, pos=pos)
+ return True
+ else:
+ return False
+
+ def match_control_line(self):
+ match = self.match(
+ r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
+ r"(?:\r?\n|\Z)", re.M)
+ if match:
+ operator = match.group(1)
+ text = match.group(2)
+ if operator == '%':
+ m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
+ if not m2:
+ raise exceptions.SyntaxException(
+ "Invalid control line: '%s'" %
+ text,
+ **self.exception_kwargs)
+ isend, keyword = m2.group(1, 2)
+ isend = (isend is not None)
+
+ if isend:
+ if not len(self.control_line):
+ raise exceptions.SyntaxException(
+ "No starting keyword '%s' for '%s'" %
+ (keyword, text),
+ **self.exception_kwargs)
+ elif self.control_line[-1].keyword != keyword:
+ raise exceptions.SyntaxException(
+ "Keyword '%s' doesn't match keyword '%s'" %
+ (text, self.control_line[-1].keyword),
+ **self.exception_kwargs)
+ self.append_node(parsetree.ControlLine, keyword, isend, text)
+ else:
+ self.append_node(parsetree.Comment, text)
+ return True
+ else:
+ return False
+
+ def match_comment(self):
+ """matches the multiline version of a comment"""
+ match = self.match(r"<%doc>(.*?)</%doc>", re.S)
+ if match:
+ self.append_node(parsetree.Comment, match.group(1))
+ return True
+ else:
+ return False
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py
new file mode 100644
index 00000000000..2af5411907a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py
@@ -0,0 +1,359 @@
+# mako/lookup.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+import os, stat, posixpath, re
+from mako import exceptions, util
+from mako.template import Template
+
+try:
+ import threading
+except:
+ import dummy_threading as threading
+
+class TemplateCollection(object):
+ """Represent a collection of :class:`.Template` objects,
+ identifiable via URI.
+
+ A :class:`.TemplateCollection` is linked to the usage of
+ all template tags that address other templates, such
+ as ``<%include>``, ``<%namespace>``, and ``<%inherit>``.
+ The ``file`` attribute of each of those tags refers
+ to a string URI that is passed to that :class:`.Template`
+ object's :class:`.TemplateCollection` for resolution.
+
+ :class:`.TemplateCollection` is an abstract class,
+ with the usual default implementation being :class:`.TemplateLookup`.
+
+ """
+
+ def has_template(self, uri):
+ """Return ``True`` if this :class:`.TemplateCollection` is
+ capable of returning a :class:`.Template` object for the
+ given ``uri``.
+
+ This is determined by calling :meth:`get_template` and checking
+ whether it raises :class:`.TemplateLookupException`.
+
+ :param uri: String URI of the template to be resolved.
+
+ """
+ try:
+ self.get_template(uri)
+ return True
+ except exceptions.TemplateLookupException:
+ return False
+
+ def get_template(self, uri, relativeto=None):
+ """Return a :class:`.Template` object corresponding to the given
+ ``uri``.
+
+ The default implementation raises
+ :class:`.NotImplementedError`. Implementations should
+ raise :class:`.TemplateLookupException` if the given ``uri``
+ cannot be resolved.
+
+ :param uri: String URI of the template to be resolved.
+ :param relativeto: if present, the given ``uri`` is assumed to
+ be relative to this URI.
+
+ """
+ raise NotImplementedError()
+
+ def filename_to_uri(self, uri, filename):
+ """Convert the given ``filename`` to a URI relative to
+ this :class:`.TemplateCollection`.
+
+ The default implementation returns ``uri`` unchanged.
+ """
+
+ return uri
+
+ def adjust_uri(self, uri, filename):
+ """Adjust the given ``uri`` based on the calling ``filename``.
+
+ When this method is called from the runtime, the
+ ``filename`` parameter is taken directly from the ``filename``
+ attribute of the calling template. Therefore a custom
+ :class:`.TemplateCollection` subclass can place any string
+ identifier desired in the ``filename`` parameter of the
+ :class:`.Template` objects it constructs and have them come back
+ here.
+
+ The default implementation returns ``uri`` unchanged.
+
+ """
+ return uri
+
+class TemplateLookup(TemplateCollection):
+ """Represent a collection of templates that locates template source files
+ from the local filesystem.
+
+ The primary argument is the ``directories`` argument, the list of
+ directories to search:
+
+ .. sourcecode:: python
+
+ lookup = TemplateLookup(["/path/to/templates"])
+ some_template = lookup.get_template("/index.html")
+
+ The :class:`.TemplateLookup` can also be given :class:`.Template` objects
+ programmatically using :meth:`.put_string` or :meth:`.put_template`:
+
+ .. sourcecode:: python
+
+ lookup = TemplateLookup()
+ lookup.put_string("base.html", '''
+ <html><body>${self.next()}</body></html>
+ ''')
+ lookup.put_string("hello.html", '''
+ <%include file='base.html'/>
+
+ Hello, world !
+ ''')
+
+
+ :param directories: A list of directory names which will be
+ searched for a particular template URI. The URI is appended
+ to each directory and the filesystem checked.
+
+ :param collection_size: Approximate size of the collection used
+ to store templates. If left at its default of ``-1``, the size
+ is unbounded, and a plain Python dictionary is used to
+ relate URI strings to :class:`.Template` instances.
+ Otherwise, a least-recently-used cache object is used which
+ will maintain the size of the collection approximately to
+ the number given.
+
+ :param filesystem_checks: When at its default value of ``True``,
+ each call to :meth:`.TemplateLookup.get_template()` will
+ compare the filesystem last modified time to the time in
+ which an existing :class:`.Template` object was created.
+ This allows the :class:`.TemplateLookup` to regenerate a
+ new :class:`.Template` whenever the original source has
+ been updated. Set this to ``False`` for a very minor
+ performance increase.
+
+ :param modulename_callable: A callable which, when present,
+ is passed the path of the source file as well as the
+ requested URI, and then returns the full path of the
+ generated Python module file. This is used to inject
+ alternate schemes for Python module location. If left at
+ its default of ``None``, the built in system of generation
+ based on ``module_directory`` plus ``uri`` is used.
+
+ All other keyword parameters available for
+ :class:`.Template` are mirrored here. When new
+ :class:`.Template` objects are created, the keywords
+ established with this :class:`.TemplateLookup` are passed on
+ to each new :class:`.Template`.
+
+ """
+
+ def __init__(self,
+ directories=None,
+ module_directory=None,
+ filesystem_checks=True,
+ collection_size=-1,
+ format_exceptions=False,
+ error_handler=None,
+ disable_unicode=False,
+ bytestring_passthrough=False,
+ output_encoding=None,
+ encoding_errors='strict',
+
+ cache_args=None,
+ cache_impl='beaker',
+ cache_enabled=True,
+ cache_type=None,
+ cache_dir=None,
+ cache_url=None,
+
+ modulename_callable=None,
+ module_writer=None,
+ default_filters=None,
+ buffer_filters=(),
+ strict_undefined=False,
+ imports=None,
+ future_imports=None,
+ enable_loop=True,
+ input_encoding=None,
+ preprocessor=None,
+ lexer_cls=None):
+
+ # search directories are normalized once; _relativeize() compares
+ # normalized filenames against these entries
+ self.directories = [posixpath.normpath(d) for d in
+ util.to_list(directories, ())
+ ]
+ self.module_directory = module_directory
+ self.modulename_callable = modulename_callable
+ self.filesystem_checks = filesystem_checks
+ self.collection_size = collection_size
+
+ if cache_args is None:
+ cache_args = {}
+ # transfer deprecated cache_* args
+ # NOTE(review): when the caller supplied ``cache_args``, these
+ # setdefault() calls modify the caller's dictionary in place.
+ if cache_dir:
+ cache_args.setdefault('dir', cache_dir)
+ if cache_url:
+ cache_args.setdefault('url', cache_url)
+ if cache_type:
+ cache_args.setdefault('type', cache_type)
+
+ # keyword arguments passed to every Template this lookup constructs
+ self.template_args = {
+ 'format_exceptions':format_exceptions,
+ 'error_handler':error_handler,
+ 'disable_unicode':disable_unicode,
+ 'bytestring_passthrough':bytestring_passthrough,
+ 'output_encoding':output_encoding,
+ 'cache_impl':cache_impl,
+ 'encoding_errors':encoding_errors,
+ 'input_encoding':input_encoding,
+ 'module_directory':module_directory,
+ 'module_writer':module_writer,
+ 'cache_args':cache_args,
+ 'cache_enabled':cache_enabled,
+ 'default_filters':default_filters,
+ 'buffer_filters':buffer_filters,
+ 'strict_undefined':strict_undefined,
+ 'imports':imports,
+ 'future_imports':future_imports,
+ 'enable_loop':enable_loop,
+ 'preprocessor':preprocessor,
+ 'lexer_cls':lexer_cls
+ }
+
+ # -1 means unbounded: plain dicts; any other size uses util.LRUCache
+ if collection_size == -1:
+ self._collection = {}
+ self._uri_cache = {}
+ else:
+ self._collection = util.LRUCache(collection_size)
+ self._uri_cache = util.LRUCache(collection_size)
+ # held by _load() while a template is being constructed
+ self._mutex = threading.Lock()
+
+ def get_template(self, uri):
+ """Return a :class:`.Template` object corresponding to the given
+ ``uri``.
+
+ .. note:: The ``relativeto`` argument is not supported here at the moment.
+
+ """
+
+ try:
+ if self.filesystem_checks:
+ return self._check(uri, self._collection[uri])
+ else:
+ return self._collection[uri]
+ except KeyError:
+ u = re.sub(r'^\/+', '', uri)
+ for dir in self.directories:
+ srcfile = posixpath.normpath(posixpath.join(dir, u))
+ if os.path.isfile(srcfile):
+ return self._load(srcfile, uri)
+ else:
+ raise exceptions.TopLevelLookupException(
+ "Cant locate template for uri %r" % uri)
+
+ def adjust_uri(self, uri, relativeto):
+ """Adjust the given ``uri`` based on the given relative URI."""
+
+ key = (uri, relativeto)
+ if key in self._uri_cache:
+ return self._uri_cache[key]
+
+ if uri[0] != '/':
+ if relativeto is not None:
+ v = self._uri_cache[key] = posixpath.join(
+ posixpath.dirname(relativeto), uri)
+ else:
+ v = self._uri_cache[key] = '/' + uri
+ else:
+ v = self._uri_cache[key] = uri
+ return v
+
+
+ def filename_to_uri(self, filename):
+ """Convert the given ``filename`` to a URI relative to
+ this :class:`.TemplateCollection`."""
+
+ try:
+ return self._uri_cache[filename]
+ except KeyError:
+ value = self._relativeize(filename)
+ self._uri_cache[filename] = value
+ return value
+
+ def _relativeize(self, filename):
+ """Return the portion of a filename that is 'relative'
+ to the directories in this lookup.
+
+ """
+
+ filename = posixpath.normpath(filename)
+ for dir in self.directories:
+ if filename[0:len(dir)] == dir:
+ return filename[len(dir):]
+ else:
+ return None
+
+ def _load(self, filename, uri):
+ self._mutex.acquire()
+ try:
+ try:
+ # try returning from collection one
+ # more time in case concurrent thread already loaded
+ return self._collection[uri]
+ except KeyError:
+ pass
+ try:
+ if self.modulename_callable is not None:
+ module_filename = self.modulename_callable(filename, uri)
+ else:
+ module_filename = None
+ self._collection[uri] = template = Template(
+ uri=uri,
+ filename=posixpath.normpath(filename),
+ lookup=self,
+ module_filename=module_filename,
+ **self.template_args)
+ return template
+ except:
+ # if compilation fails etc, ensure
+ # template is removed from collection,
+ # re-raise
+ self._collection.pop(uri, None)
+ raise
+ finally:
+ self._mutex.release()
+
+ def _check(self, uri, template):
+ if template.filename is None:
+ return template
+
+ try:
+ template_stat = os.stat(template.filename)
+ if template.module._modified_time < \
+ template_stat[stat.ST_MTIME]:
+ self._collection.pop(uri, None)
+ return self._load(template.filename, uri)
+ else:
+ return template
+ except OSError:
+ self._collection.pop(uri, None)
+ raise exceptions.TemplateLookupException(
+ "Cant locate template for uri %r" % uri)
+
+
+ def put_string(self, uri, text):
+ """Place a new :class:`.Template` object into this
+ :class:`.TemplateLookup`, based on the given string of
+ ``text``.
+
+ The template is constructed with this lookup's ``template_args``
+ and stored under ``uri``, replacing any existing entry.
+
+ """
+ self._collection[uri] = Template(
+ text,
+ lookup=self,
+ uri=uri,
+ **self.template_args)
+
+ def put_template(self, uri, template):
+ """Place a new :class:`.Template` object into this
+ :class:`.TemplateLookup`, based on the given
+ :class:`.Template` object.
+
+ The object is stored as given under ``uri``, replacing any
+ existing entry.
+
+ """
+ self._collection[uri] = template
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py
new file mode 100644
index 00000000000..49ec4e0696c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py
@@ -0,0 +1,594 @@
+# mako/parsetree.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""defines the parse tree components for Mako templates."""
+
+from mako import exceptions, ast, util, filters, compat
+import re
+
+class Node(object):
+ """base class for a Node in the parse tree."""
+
+ def __init__(self, source, lineno, pos, filename):
+ self.source = source
+ self.lineno = lineno
+ self.pos = pos
+ self.filename = filename
+
+ @property
+ def exception_kwargs(self):
+ return {'source': self.source, 'lineno': self.lineno,
+ 'pos': self.pos, 'filename': self.filename}
+
+ def get_children(self):
+ return []
+
+ def accept_visitor(self, visitor):
+ def traverse(node):
+ for n in node.get_children():
+ n.accept_visitor(visitor)
+
+ method = getattr(visitor, "visit" + self.__class__.__name__, traverse)
+ method(self)
+
+class TemplateNode(Node):
+ """a 'container' node that stores the overall collection of nodes."""
+
+ def __init__(self, filename):
+ super(TemplateNode, self).__init__('', 0, 0, filename)
+ self.nodes = []
+ self.page_attributes = {}
+
+ def get_children(self):
+ return self.nodes
+
+ def __repr__(self):
+ return "TemplateNode(%s, %r)" % (
+ util.sorted_dict_repr(self.page_attributes),
+ self.nodes)
+
+class ControlLine(Node):
+ """defines a control line, a line-oriented python line or end tag.
+
+ e.g.::
+
+ % if foo:
+ (markup)
+ % endif
+
+ """
+
+ has_loop_context = False
+
+ def __init__(self, keyword, isend, text, **kwargs):
+ super(ControlLine, self).__init__(**kwargs)
+ self.text = text
+ self.keyword = keyword
+ self.isend = isend
+ self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with']
+ self.nodes = []
+ if self.isend:
+ self._declared_identifiers = []
+ self._undeclared_identifiers = []
+ else:
+ code = ast.PythonFragment(text, **self.exception_kwargs)
+ self._declared_identifiers = code.declared_identifiers
+ self._undeclared_identifiers = code.undeclared_identifiers
+
+ def get_children(self):
+ return self.nodes
+
+ def declared_identifiers(self):
+ return self._declared_identifiers
+
+ def undeclared_identifiers(self):
+ return self._undeclared_identifiers
+
+ def is_ternary(self, keyword):
+ """return true if the given keyword is a ternary keyword
+ for this ControlLine"""
+
+ return keyword in {
+ 'if':set(['else', 'elif']),
+ 'try':set(['except', 'finally']),
+ 'for':set(['else'])
+ }.get(self.keyword, [])
+
+ def __repr__(self):
+ return "ControlLine(%r, %r, %r, %r)" % (
+ self.keyword,
+ self.text,
+ self.isend,
+ (self.lineno, self.pos)
+ )
+
+class Text(Node):
+ """defines plain text in the template."""
+
+ def __init__(self, content, **kwargs):
+ super(Text, self).__init__(**kwargs)
+ self.content = content
+
+ def __repr__(self):
+ return "Text(%r, %r)" % (self.content, (self.lineno, self.pos))
+
+class Code(Node):
+ """defines a Python code block, either inline or module level.
+
+ e.g.::
+
+ inline:
+ <%
+ x = 12
+ %>
+
+ module level:
+ <%!
+ import logger
+ %>
+
+ """
+
+ def __init__(self, text, ismodule, **kwargs):
+ super(Code, self).__init__(**kwargs)
+ self.text = text
+ self.ismodule = ismodule
+ self.code = ast.PythonCode(text, **self.exception_kwargs)
+
+ def declared_identifiers(self):
+ return self.code.declared_identifiers
+
+ def undeclared_identifiers(self):
+ return self.code.undeclared_identifiers
+
+ def __repr__(self):
+ return "Code(%r, %r, %r)" % (
+ self.text,
+ self.ismodule,
+ (self.lineno, self.pos)
+ )
+
+class Comment(Node):
+ """defines a comment line.
+
+ # this is a comment
+
+ """
+
+ def __init__(self, text, **kwargs):
+ super(Comment, self).__init__(**kwargs)
+ self.text = text
+
+ def __repr__(self):
+ return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos))
+
+class Expression(Node):
+ """defines an inline expression.
+
+ ${x+y}
+
+ """
+
+ def __init__(self, text, escapes, **kwargs):
+ super(Expression, self).__init__(**kwargs)
+ self.text = text
+ self.escapes = escapes
+ self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs)
+ self.code = ast.PythonCode(text, **self.exception_kwargs)
+
+ def declared_identifiers(self):
+ return []
+
+ def undeclared_identifiers(self):
+ # TODO: make the "filter" shortcut list configurable at parse/gen time
+ return self.code.undeclared_identifiers.union(
+ self.escapes_code.undeclared_identifiers.difference(
+ set(filters.DEFAULT_ESCAPES.keys())
+ )
+ ).difference(self.code.declared_identifiers)
+
+ def __repr__(self):
+ return "Expression(%r, %r, %r)" % (
+ self.text,
+ self.escapes_code.args,
+ (self.lineno, self.pos)
+ )
+
+class _TagMeta(type):
+ """metaclass to allow Tag to produce a subclass according to
+ its keyword"""
+
+ _classmap = {}
+
+ def __init__(cls, clsname, bases, dict):
+ if getattr(cls, '__keyword__', None) is not None:
+ cls._classmap[cls.__keyword__] = cls
+ super(_TagMeta, cls).__init__(clsname, bases, dict)
+
+ def __call__(cls, keyword, attributes, **kwargs):
+ if ":" in keyword:
+ ns, defname = keyword.split(':')
+ return type.__call__(CallNamespaceTag, ns, defname,
+ attributes, **kwargs)
+
+ try:
+ cls = _TagMeta._classmap[keyword]
+ except KeyError:
+ raise exceptions.CompileException(
+ "No such tag: '%s'" % keyword,
+ source=kwargs['source'],
+ lineno=kwargs['lineno'],
+ pos=kwargs['pos'],
+ filename=kwargs['filename']
+ )
+ return type.__call__(cls, keyword, attributes, **kwargs)
+
+class Tag(compat.with_metaclass(_TagMeta, Node)):
+ """abstract base class for tags.
+
+ <%sometag/>
+
+ <%someothertag>
+ stuff
+ </%someothertag>
+
+ """
+ __keyword__ = None
+
+ def __init__(self, keyword, attributes, expressions,
+ nonexpressions, required, **kwargs):
+ """construct a new Tag instance.
+
+ this constructor not called directly, and is only called
+ by subclasses.
+
+ :param keyword: the tag keyword
+
+ :param attributes: raw dictionary of attribute key/value pairs
+
+ :param expressions: a set of identifiers that are legal attributes,
+ which can also contain embedded expressions
+
+ :param nonexpressions: a set of identifiers that are legal
+ attributes, which cannot contain embedded expressions
+
+ :param \**kwargs:
+ other arguments passed to the Node superclass (lineno, pos)
+
+ """
+ super(Tag, self).__init__(**kwargs)
+ self.keyword = keyword
+ self.attributes = attributes
+ self._parse_attributes(expressions, nonexpressions)
+ missing = [r for r in required if r not in self.parsed_attributes]
+ if len(missing):
+ raise exceptions.CompileException(
+ "Missing attribute(s): %s" %
+ ",".join([repr(m) for m in missing]),
+ **self.exception_kwargs)
+ self.parent = None
+ self.nodes = []
+
+ def is_root(self):
+ return self.parent is None
+
+ def get_children(self):
+ return self.nodes
+
+ def _parse_attributes(self, expressions, nonexpressions):
+ undeclared_identifiers = set()
+ self.parsed_attributes = {}
+ for key in self.attributes:
+ if key in expressions:
+ expr = []
+ for x in re.compile(r'(\${.+?})',
+ re.S).split(self.attributes[key]):
+ m = re.compile(r'^\${(.+?)}$', re.S).match(x)
+ if m:
+ code = ast.PythonCode(m.group(1).rstrip(),
+ **self.exception_kwargs)
+ # we aren't discarding "declared_identifiers" here,
+ # which we do so that list comprehension-declared
+ # variables aren't counted. As yet can't find a
+ # condition that requires it here.
+ undeclared_identifiers = \
+ undeclared_identifiers.union(
+ code.undeclared_identifiers)
+ expr.append('(%s)' % m.group(1))
+ else:
+ if x:
+ expr.append(repr(x))
+ self.parsed_attributes[key] = " + ".join(expr) or repr('')
+ elif key in nonexpressions:
+ if re.search(r'\${.+?}', self.attributes[key]):
+ raise exceptions.CompileException(
+ "Attibute '%s' in tag '%s' does not allow embedded "
+ "expressions" % (key, self.keyword),
+ **self.exception_kwargs)
+ self.parsed_attributes[key] = repr(self.attributes[key])
+ else:
+ raise exceptions.CompileException(
+ "Invalid attribute for tag '%s': '%s'" %
+ (self.keyword, key),
+ **self.exception_kwargs)
+ self.expression_undeclared_identifiers = undeclared_identifiers
+
+ def declared_identifiers(self):
+ return []
+
+ def undeclared_identifiers(self):
+ return self.expression_undeclared_identifiers
+
+ def __repr__(self):
+ return "%s(%r, %s, %r, %r)" % (self.__class__.__name__,
+ self.keyword,
+ util.sorted_dict_repr(self.attributes),
+ (self.lineno, self.pos),
+ self.nodes
+ )
+
+class IncludeTag(Tag):
+ __keyword__ = 'include'
+
+ def __init__(self, keyword, attributes, **kwargs):
+ super(IncludeTag, self).__init__(
+ keyword,
+ attributes,
+ ('file', 'import', 'args'),
+ (), ('file',), **kwargs)
+ self.page_args = ast.PythonCode(
+ "__DUMMY(%s)" % attributes.get('args', ''),
+ **self.exception_kwargs)
+
+ def declared_identifiers(self):
+ return []
+
+ def undeclared_identifiers(self):
+ identifiers = self.page_args.undeclared_identifiers.\
+ difference(set(["__DUMMY"])).\
+ difference(self.page_args.declared_identifiers)
+ return identifiers.union(super(IncludeTag, self).
+ undeclared_identifiers())
+
class NamespaceTag(Tag):
    """Tag node for ``<%namespace>``.

    At least one of ``name`` / ``import`` must be given, and ``file``
    and ``module`` are mutually exclusive.
    """

    __keyword__ = 'namespace'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attribute combination and settle on a name."""
        expression_attrs = ('file',)
        plain_attrs = ('name', 'inheritable', 'import', 'module')
        super(NamespaceTag, self).__init__(
            keyword, attributes,
            expression_attrs, plain_attrs, (),
            **kwargs)

        # fall back to a generated name derived from this object's id
        generated_name = '__anon_%s' % hex(abs(id(self)))
        self.name = attributes.get('name', generated_name)

        if not ('name' in attributes or 'import' in attributes):
            raise exceptions.CompileException(
                "'name' and/or 'import' attributes are required "
                "for <%namespace>",
                **self.exception_kwargs)

        if 'file' in attributes and 'module' in attributes:
            raise exceptions.CompileException(
                "<%namespace> may only have one of 'file' or 'module'",
                **self.exception_kwargs)

    def declared_identifiers(self):
        """A namespace tag declares no identifiers."""
        return list()
+
class TextTag(Tag):
    """Tag node for ``<%text>``.

    The only attribute accepted is an optional ``filter``, which may not
    contain embedded expressions.
    """

    __keyword__ = 'text'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes and parse the ``filter`` argument list.

        :param keyword: the tag keyword.
        :param attributes: dictionary of attribute names to raw values.
        :param kwargs: forwarded unchanged to the base initializer.
        """
        super(TextTag, self).__init__(
            keyword,
            attributes, (),
            # Must be a one-element tuple.  A bare ('filter') is just the
            # string 'filter', which turns the "key in nonexpressions"
            # attribute check into a substring test and silently accepts
            # bogus attribute names such as 'f' or 'ilt'.
            ('filter',), (), **kwargs)
        self.filter_args = ast.ArgumentList(
            attributes.get('filter', ''),
            **self.exception_kwargs)

    def undeclared_identifiers(self):
        """Return the names referenced by the ``filter`` argument, minus
        the default escape names, combined with the names referenced by
        the attribute expressions."""
        return self.filter_args.\
            undeclared_identifiers.\
            difference(filters.DEFAULT_ESCAPES.keys()).union(
                self.expression_undeclared_identifiers
            )
+
class DefTag(Tag):
    """Tag node for ``<%def>``.

    The ``name`` attribute carries a complete Python signature, e.g.
    ``name="foo(x, y=5)"``; it is parsed as a function declaration.
    """

    __keyword__ = 'def'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes and parse the signature and filters."""
        # every attribute starting with "cache_" may hold an expression
        expression_attrs = ['buffered', 'cached']
        expression_attrs.extend(
            key for key in attributes if key.startswith('cache_'))

        super(DefTag, self).__init__(
            keyword, attributes,
            expression_attrs,
            ('name', 'filter', 'decorator'),
            ('name',),
            **kwargs)

        signature = attributes['name']
        # a bare word means the argument list, "()", was left off
        if re.match(r'^[\w_]+$', signature):
            raise exceptions.CompileException(
                "Missing parenthesis in %def",
                **self.exception_kwargs)

        self.function_decl = ast.FunctionDecl(
            "def " + signature + ":pass", **self.exception_kwargs)
        self.name = self.function_decl.funcname
        self.decorator = attributes.get('decorator', '')
        self.filter_args = ast.ArgumentList(
            attributes.get('filter', ''), **self.exception_kwargs)

    is_anonymous = False
    is_block = False

    @property
    def funcname(self):
        """Name of the function as parsed from the signature."""
        return self.function_decl.funcname

    def get_argument_expressions(self, **kw):
        """Delegate to the parsed function declaration."""
        return self.function_decl.get_argument_expressions(**kw)

    def declared_identifiers(self):
        """The def's argument names are what it declares."""
        return self.function_decl.allargnames

    def undeclared_identifiers(self):
        """Return names referenced by default values, filters and attribute
        expressions, excluding the default escapes (for filters) and the
        def's own argument names."""
        referenced = set()
        for default in self.function_decl.defaults:
            default_code = ast.PythonCode(default, **self.exception_kwargs)
            referenced.update(list(default_code.undeclared_identifiers))

        filter_names = self.filter_args.undeclared_identifiers.difference(
            filters.DEFAULT_ESCAPES.keys())

        referenced = referenced.union(filter_names)
        referenced = referenced.union(self.expression_undeclared_identifiers)
        return referenced.difference(self.function_decl.allargnames)
+
class BlockTag(Tag):
    """Tag node for ``<%block>``.

    A block may be anonymous; when named, the name must be a plain
    identifier and only then may ``args`` be supplied.
    """

    __keyword__ = 'block'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes and parse ``args`` and ``filter``."""
        # every attribute starting with "cache_" may hold an expression
        expression_attrs = ['buffered', 'cached', 'args']
        expression_attrs.extend(
            key for key in attributes if key.startswith('cache_'))

        super(BlockTag, self).__init__(
            keyword, attributes,
            expression_attrs,
            ('name', 'filter', 'decorator'),
            (),
            **kwargs)

        block_name = attributes.get('name')
        if block_name and not re.match(r'^[\w_]+$', block_name):
            raise exceptions.CompileException(
                "%block may not specify an argument signature",
                **self.exception_kwargs)
        if not block_name and attributes.get('args', None):
            raise exceptions.CompileException(
                "Only named %blocks may specify args",
                **self.exception_kwargs)

        self.body_decl = ast.FunctionArgs(
            attributes.get('args', ''), **self.exception_kwargs)
        self.name = block_name
        self.decorator = attributes.get('decorator', '')
        self.filter_args = ast.ArgumentList(
            attributes.get('filter', ''), **self.exception_kwargs)

    is_block = True

    @property
    def is_anonymous(self):
        """True when no ``name`` attribute was given."""
        return self.name is None

    @property
    def funcname(self):
        """The block name, or a name generated from the line number."""
        if self.name:
            return self.name
        return "__M_anon_%d" % (self.lineno, )

    def get_argument_expressions(self, **kw):
        """Delegate to the parsed ``args`` declaration."""
        return self.body_decl.get_argument_expressions(**kw)

    def declared_identifiers(self):
        """The names listed in ``args`` are what the block declares."""
        return self.body_decl.allargnames

    def undeclared_identifiers(self):
        """Return names referenced by the filters (minus the default
        escapes) together with those of the attribute expressions."""
        filter_names = self.filter_args.undeclared_identifiers.difference(
            filters.DEFAULT_ESCAPES.keys())
        return filter_names.union(self.expression_undeclared_identifiers)
+
+
+
class CallTag(Tag):
    """Tag node for ``<%call>``.

    ``expr`` is required and holds the Python expression to call;
    ``args`` optionally names the arguments passed to the tag body.
    """

    __keyword__ = 'call'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes and parse ``expr`` and ``args``.

        :param keyword: the tag keyword.
        :param attributes: dictionary of attribute names to raw values.
        :param kwargs: forwarded unchanged to the base initializer.
        """
        # ('args',) must be a one-element tuple.  A bare ('args') is just
        # the string 'args', so any "in" membership test against it would
        # match substrings such as 'a' or 'rg' rather than whole names.
        super(CallTag, self).__init__(keyword, attributes,
                                      ('args',), ('expr',), ('expr',),
                                      **kwargs)
        self.expression = attributes['expr']
        self.code = ast.PythonCode(self.expression, **self.exception_kwargs)
        self.body_decl = ast.FunctionArgs(attributes.get('args', ''),
                                          **self.exception_kwargs)

    def declared_identifiers(self):
        """Return names declared by the call expression plus the body's
        argument names."""
        return self.code.declared_identifiers.union(self.body_decl.allargnames)

    def undeclared_identifiers(self):
        """Return names the call expression references but does not
        declare itself."""
        return self.code.undeclared_identifiers.\
            difference(self.code.declared_identifiers)
+
class CallNamespaceTag(Tag):
    """Tag node for a ``<%namespace:defname>`` style call.

    Every attribute given is treated as an expression; all of them
    except ``args`` become keyword arguments of the generated call.
    """

    def __init__(self, namespace, defname, attributes, **kwargs):
        """Build the ``namespace.defname(key=value, ...)`` expression."""
        expression_attrs = tuple(attributes.keys()) + ('args', )
        super(CallNamespaceTag, self).__init__(
            namespace + ":" + defname,
            attributes,
            expression_attrs,
            (),
            (),
            **kwargs)

        keyword_args = []
        for key, value in self.parsed_attributes.items():
            if key != 'args':
                keyword_args.append("%s=%s" % (key, value))

        self.expression = "%s.%s(%s)" % (
            namespace, defname, ",".join(keyword_args))
        self.code = ast.PythonCode(self.expression, **self.exception_kwargs)
        self.body_decl = ast.FunctionArgs(
            attributes.get('args', ''), **self.exception_kwargs)

    def declared_identifiers(self):
        """Return names declared by the call expression plus the body's
        argument names."""
        declared = self.code.declared_identifiers
        return declared.union(self.body_decl.allargnames)

    def undeclared_identifiers(self):
        """Return names the call expression references but does not
        declare itself."""
        referenced = self.code.undeclared_identifiers
        return referenced.difference(self.code.declared_identifiers)
+
class InheritTag(Tag):
    """Tag node for ``<%inherit>``; its single, required attribute is
    ``file``, which may contain embedded expressions."""

    __keyword__ = 'inherit'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes through the base initializer."""
        expression_attrs = ('file',)
        plain_attrs = ()
        required_attrs = ('file',)
        super(InheritTag, self).__init__(
            keyword, attributes,
            expression_attrs, plain_attrs, required_attrs,
            **kwargs)
+
class PageTag(Tag):
    """Tag node for ``<%page>``.

    All accepted attributes are expression attributes; none is required.
    """

    __keyword__ = 'page'

    def __init__(self, keyword, attributes, **kwargs):
        """Validate the attributes and parse ``args`` and
        ``expression_filter``."""
        # every attribute starting with "cache_" may hold an expression
        expression_attrs = [
            'cached', 'args', 'expression_filter', 'enable_loop']
        expression_attrs.extend(
            key for key in attributes if key.startswith('cache_'))

        super(PageTag, self).__init__(
            keyword, attributes,
            expression_attrs,
            (),
            (),
            **kwargs)

        self.body_decl = ast.FunctionArgs(
            attributes.get('args', ''), **self.exception_kwargs)
        self.filter_args = ast.ArgumentList(
            attributes.get('expression_filter', ''),
            **self.exception_kwargs)

    def declared_identifiers(self):
        """The names listed in ``args`` are what the page declares."""
        return self.body_decl.allargnames
+
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py
new file mode 100644
index 00000000000..5ba5125a4c7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py
@@ -0,0 +1,299 @@
+# mako/pygen.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""utilities for generating and formatting literal Python code."""
+
+import re
+from mako import exceptions
+
class PythonPrinter(object):
    """Writes lines of Python source to a stream, tracking the indentation
    level from the content of the lines themselves.

    Lines passed to :meth:`writeline` are indented automatically; blocks
    passed to :meth:`write_indented_block` are buffered and re-based onto
    the current indentation level when flushed.
    """

    def __init__(self, stream):
        """Create a printer writing to ``stream`` (any object with a
        ``write()`` method)."""
        # indentation counter
        self.indent = 0

        # a stack storing information about why we incremented
        # the indentation counter, to help us determine if we
        # should decrement it
        self.indent_detail = []

        # the string of whitespace multiplied by the indent
        # counter to produce a line
        # NOTE(review): whitespace in this listing appears collapsed;
        # confirm the intended width of this literal against upstream.
        self.indentstring = " "

        # the stream we are writing to
        self.stream = stream

        # current line number
        self.lineno = 1

        # a list of lines that represents a buffered "block" of code,
        # which can be later printed relative to an indent level
        self.line_buffer = []

        # True while consecutive writeline() calls are being made; reset
        # by write_indented_block() so the next writeline() flushes the
        # buffered block first
        self.in_indent_lines = False

        self._reset_multi_line_flags()

        # mapping of generated python lines to template
        # source lines
        self.source_map = {}

    def _update_lineno(self, num):
        """Advance the current output line number by ``num``."""
        self.lineno += num

    def start_source(self, lineno):
        """Record that the current output line corresponds to template
        line ``lineno``; the first mapping for an output line wins."""
        if self.lineno not in self.source_map:
            self.source_map[self.lineno] = lineno

    def write_blanks(self, num):
        """Write ``num`` newlines directly to the stream."""
        self.stream.write("\n" * num)
        self._update_lineno(num)

    def write_indented_block(self, block):
        """print a line or lines of python which already contain indentation.

        The indentation of the total block of lines will be adjusted to that of
        the current indent level."""
        self.in_indent_lines = False
        for l in re.split(r'\r?\n', block):
            # lines are only buffered here; they reach the stream in
            # _flush_adjusted_lines()
            self.line_buffer.append(l)
            self._update_lineno(1)

    def writelines(self, *lines):
        """print a series of lines of python."""
        for line in lines:
            self.writeline(line)

    def writeline(self, line):
        """print a line of python, indenting it according to the current
        indent level.

        this also adjusts the indentation counter according to the
        content of the line.

        Passing ``None`` writes nothing and only closes one indentation
        level.
        """

        if not self.in_indent_lines:
            self._flush_adjusted_lines()
            self.in_indent_lines = True

        # None, comment-only and whitespace-only lines carry no "text"
        if (line is None or
            re.match(r"^\s*#",line) or
            re.match(r"^\s*$", line)
            ):
            hastext = False
        else:
            hastext = True

        # only a '#' in the very first column counts as a comment here
        is_comment = line and len(line) and line[0] == '#'

        # see if this line should decrease the indentation level
        if (not is_comment and
            (not hastext or self._is_unindentor(line))
            ):

            if self.indent > 0:
                self.indent -= 1
                # if the indent_detail stack is empty, the user
                # probably put extra closures - the resulting
                # module won't compile.
                if len(self.indent_detail) == 0:
                    raise exceptions.SyntaxException(
                        "Too many whitespace closures")
                self.indent_detail.pop()

        if line is None:
            return

        # write the line
        self.stream.write(self._indent_line(line) + "\n")
        self._update_lineno(len(line.split("\n")))

        # see if this line should increase the indentation level.
        # note that a line can both decrease (before printing) and
        # then increase (after printing) the indentation level.

        # the line ends in a colon, optionally followed by a comment
        if re.search(r":[ \t]*(?:#.*)?$", line):
            # increment indentation count, and also
            # keep track of what the keyword was that indented us,
            # if it is a python compound statement keyword
            # where we might have to look for an "unindent" keyword
            match = re.match(r"^\s*(if|try|elif|while|for|with)", line)
            if match:
                # it's a "compound" keyword, so we will check for "unindentors"
                indentor = match.group(1)
                self.indent += 1
                self.indent_detail.append(indentor)
            else:
                indentor = None
                # it's not a "compound" keyword. but let's also
                # test for valid Python keywords that might be indenting us,
                # else assume it's a non-indenting line
                m2 = re.match(r"^\s*(def|class|else|elif|except|finally)",
                              line)
                if m2:
                    self.indent += 1
                    self.indent_detail.append(indentor)

    def close(self):
        """close this printer, flushing any remaining lines."""
        self._flush_adjusted_lines()

    def _is_unindentor(self, line):
        """return true if the given line is an 'unindentor',
        relative to the last 'indent' event received.

        """

        # no indentation detail has been pushed on; return False
        if len(self.indent_detail) == 0:
            return False

        indentor = self.indent_detail[-1]

        # the last indent keyword we grabbed is not a
        # compound statement keyword; return False
        if indentor is None:
            return False

        # if the current line doesn't have one of the "unindentor" keywords,
        # return False
        match = re.match(r"^\s*(else|elif|except|finally).*\:", line)
        if not match:
            return False

        # whitespace matches up, we have a compound indentor,
        # and this line has an unindentor, this
        # is probably good enough
        return True

        # should we decide that it's not good enough, here's
        # more stuff to check.
        #keyword = match.group(1)

        # match the original indent keyword
        #for crit in [
        #   (r'if|elif', r'else|elif'),
        #   (r'try', r'except|finally|else'),
        #   (r'while|for', r'else'),
        #]:
        #   if re.match(crit[0], indentor) and re.match(crit[1], keyword):
        #        return True

        #return False

    def _indent_line(self, line, stripspace=''):
        """indent the given line according to the current indent level.

        stripspace is a string of space that will be truncated from the
        start of the line before indenting."""

        # stripspace is interpolated into the pattern as-is
        return re.sub(r"^%s" % stripspace, self.indentstring
                      * self.indent, line)

    def _reset_multi_line_flags(self):
        """reset the flags which would indicate we are in a backslashed
        or triple-quoted section."""

        self.backslashed, self.triplequoted = False, False

    def _in_multi_line(self, line):
        """return true if the given line is part of a multi-line block,
        via backslash or triple-quote.

        The answer reflects the state left by the *previous* lines; the
        flags are then updated from ``line`` for the next call."""

        # we are only looking for explicitly joined lines here, not
        # implicit ones (i.e. brackets, braces etc.).  this is just to
        # guard against the possibility of modifying the space inside of
        # a literal multiline string with unfortunately placed
        # whitespace

        current_state = (self.backslashed or self.triplequoted)

        if re.search(r"\\$", line):
            self.backslashed = True
        else:
            self.backslashed = False

        # an odd number of triple-quote markers toggles the state
        triples = len(re.findall(r"\"\"\"|\'\'\'", line))
        if triples == 1 or triples % 2 != 0:
            self.triplequoted = not self.triplequoted

        return current_state

    def _flush_adjusted_lines(self):
        """Write the buffered block to the stream, then empty the buffer.

        Lines inside a backslash or triple-quote continuation are written
        untouched; the others have tabs expanded and are passed through
        ``_indent_line`` with the leading whitespace of the first line
        that holds real code as the margin to strip."""
        stripspace = None
        self._reset_multi_line_flags()

        for entry in self.line_buffer:
            if self._in_multi_line(entry):
                self.stream.write(entry + "\n")
            else:
                entry = entry.expandtabs()
                # the margin comes from the first line that is neither
                # blank nor a pure comment
                if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry):
                    stripspace = re.match(r"^([ \t]*)", entry).group(1)
                self.stream.write(self._indent_line(entry, stripspace) + "\n")

        self.line_buffer = []
        self._reset_multi_line_flags()
+
+
def adjust_whitespace(text):
    """Strip the common left margin from a block of Python code.

    The margin is the leading whitespace of the first line that is
    neither blank nor a pure comment.  Lines that continue a backslash
    continuation or sit inside a triple-quoted string are passed through
    untouched; all other lines have their tabs expanded first.
    """

    # slot 0: previous line ended in a backslash
    # slot 1: False, or the triple-quote delimiter currently open
    BACKSLASH, TRIPLE = 0, 1
    flags = [False, False]

    def consume(pattern, remaining):
        """Match ``pattern`` at the start of ``remaining``; return the
        match (or None) and whatever text is left after it."""
        found = re.match(pattern, remaining)
        if found is None:
            return None, remaining
        return found, remaining[len(found.group(0)):]

    def continues_previous(line):
        """Report whether ``line`` continues a multi-line construct begun
        earlier, then update the flags from the line's own content."""
        carried_over = (flags[BACKSLASH] or flags[TRIPLE])

        flags[BACKSLASH] = bool(re.search(r"\\$", line))

        while line:
            if flags[TRIPLE]:
                # inside a triple-quoted string: look for its terminator
                closing, line = consume(r"%s" % flags[TRIPLE], line)
                if closing:
                    flags[TRIPLE] = False
                else:
                    _, line = consume(
                        r".*?(?=%s|$)" % flags[TRIPLE], line)
                continue

            # a comment ends the scan of this line
            comment, line = consume(r'#', line)
            if comment:
                return carried_over

            opening, line = consume(r"\"\"\"|\'\'\'", line)
            if opening:
                flags[TRIPLE] = opening.group(0)
                continue

            # skip ahead to the next quote marker, comment or end of line
            _, line = consume(r".*?(?=\"\"\"|\'\'\'|#|$)", line)

        return carried_over

    margin = None
    adjusted = []

    for raw in re.split(r'\r?\n', text):
        if continues_previous(raw):
            adjusted.append(raw)
            continue

        expanded = raw.expandtabs()
        if margin is None and re.search(r"^[ \t]*[^# \t]", expanded):
            margin = re.match(r"^([ \t]*)", expanded).group(1)
        adjusted.append(re.sub(r"^%s" % margin, '', expanded))

    return "\n".join(adjusted)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py
new file mode 100644
index 00000000000..bfa46a9fafd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py
@@ -0,0 +1,232 @@
+# mako/pyparser.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""Handles parsing of Python code.
+
+Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler
+module is used.
+"""
+
+from mako import exceptions, util, compat
+from mako.compat import arg_stringname
+import operator
+
# words that cannot be assigned to (notably smaller than the
# total keys in __builtins__); 'print' is added on Python 3 only
reserved = set(['True', 'False', 'None'])
if compat.py3k:
    reserved.add('print')

# accessor for the name held by a function-argument node: the "arg"
# attribute on Python 3, the "id" attribute otherwise
arg_id = operator.attrgetter('arg' if compat.py3k else 'id')
+
+import _ast
+util.restore__ast(_ast)
+from mako import _ast_util
+
+
def parse(code, mode='exec', **exception_kwargs):
    """Parse ``code`` into an AST.

    :param code: source text to parse.
    :param mode: parse mode handed to ``_ast_util.parse``.
    :param exception_kwargs: extra keyword arguments attached to the
     :class:`.SyntaxException` raised when parsing fails.

    Any exception raised while parsing is re-raised as a
    ``SyntaxException`` whose message holds the original exception's
    class name, its text and the first 50 characters of ``code``.
    """

    try:
        tree = _ast_util.parse(code, '<unknown>', mode)
    except Exception:
        failure_name = compat.exception_as().__class__.__name__
        failure = compat.exception_as()
        message = "(%s) %s (%r)" % (failure_name, failure, code[:50])
        raise exceptions.SyntaxException(message, **exception_kwargs)
    return tree
+
+
class FindIdentifiers(_ast_util.NodeVisitor):
    """AST visitor that sorts the names used by a piece of code into
    "declared" and "undeclared" sets on a listener object.

    Names bound inside a function body are tracked locally and are not
    reported as declared on the listener.
    """

    def __init__(self, listener, **exception_kwargs):
        """``listener`` must expose ``declared_identifiers`` and
        ``undeclared_identifiers`` sets."""
        self.listener = listener
        self.exception_kwargs = exception_kwargs
        self.in_function = False
        self.in_assign_targets = False
        self.local_ident_stack = set()

    def _add_declared(self, name):
        """Record ``name`` as bound: locally when inside a function,
        otherwise on the listener."""
        if self.in_function:
            self.local_ident_stack.add(name)
        else:
            self.listener.declared_identifiers.add(name)

    def visit_ClassDef(self, node):
        """A class statement declares its name; the body is not visited."""
        self._add_declared(node.name)

    def visit_Assign(self, node):
        """Visit the right-hand side before the targets."""

        # the value goes first so that in a clause like "x=x+5" the
        # x on the right is still seen as undeclared

        self.visit(node.value)
        previous_flag = self.in_assign_targets
        self.in_assign_targets = True
        for target in node.targets:
            self.visit(target)
        self.in_assign_targets = previous_flag

    if compat.py3k:

        # ExceptHandler is in Python 2, but this block only works in
        # Python 3 (and is required there)

        def visit_ExceptHandler(self, node):
            """Declare the ``as`` name, then visit the type and the body."""
            if node.name is not None:
                self._add_declared(node.name)
            if node.type is not None:
                self.visit(node.type)
            for statement in node.body:
                self.visit(statement)

    def visit_Lambda(self, node, *args):
        """A lambda is handled as a function whose body is an expression."""
        self._visit_function(node, True)

    def visit_FunctionDef(self, node):
        """Declare the function name, then scan its body."""
        self._add_declared(node.name)
        self._visit_function(node, False)

    def _expand_tuples(self, args):
        """Yield each argument node, flattening tuple arguments one level."""
        for arg in args:
            if not isinstance(arg, _ast.Tuple):
                yield arg
                continue
            for element in arg.elts:
                yield element

    def _visit_function(self, node, islambda):
        """Scan a function or lambda body with function-local state."""

        # while inside the function nothing more is logged as "declared"
        # on the listener, but "undeclared" names keep being logged.
        # the argument names of the header count as local so that they
        # are not reported as "undeclared"

        outer_in_function = self.in_function
        outer_locals = self.local_ident_stack

        self.in_function = True
        argument_names = [
            arg_id(arg) for arg in self._expand_tuples(node.args.args)]
        self.local_ident_stack = outer_locals.union(argument_names)

        if islambda:
            self.visit(node.body)
        else:
            for statement in node.body:
                self.visit(statement)

        # restore the enclosing scope's state
        self.in_function = outer_in_function
        self.local_ident_stack = outer_locals

    def visit_For(self, node):
        """Visit the iterable before the loop target, then both bodies."""

        self.visit(node.iter)
        self.visit(node.target)
        for statement in node.body:
            self.visit(statement)
        for statement in node.orelse:
            self.visit(statement)

    def visit_Name(self, node):
        """Classify a name as declared (store) or possibly undeclared."""
        if isinstance(node.ctx, _ast.Store):
            # this is equivalent to visit_AssName in
            # compiler
            self._add_declared(node.id)
            return

        identifier = node.id
        if identifier in reserved:
            return
        if identifier in self.listener.declared_identifiers:
            return
        if identifier in self.local_ident_stack:
            return
        self.listener.undeclared_identifiers.add(identifier)

    def visit_Import(self, node):
        """Declare the alias, or else the top-level package name."""
        for alias in node.names:
            if alias.asname is None:
                self._add_declared(alias.name.split('.')[0])
            else:
                self._add_declared(alias.asname)

    def visit_ImportFrom(self, node):
        """Declare each imported name or alias; ``import *`` is rejected."""
        for alias in node.names:
            if alias.asname is not None:
                self._add_declared(alias.asname)
            elif alias.name == '*':
                raise exceptions.CompileException(
                    "'import *' is not supported, since all identifier "
                    "names must be explicitly declared.  Please use the "
                    "form 'from <modulename> import <name1>, <name2>, "
                    "...' instead.", **self.exception_kwargs)
            else:
                self._add_declared(alias.name)
+
+
class FindTuple(_ast_util.NodeVisitor):
    """AST visitor that splits a tuple expression into its elements and
    reports each one to a listener."""

    def __init__(self, listener, code_factory, **exception_kwargs):
        """``code_factory`` is called as ``code_factory(node, **kwargs)``
        for every tuple element."""
        self.listener = listener
        self.code_factory = code_factory
        self.exception_kwargs = exception_kwargs

    def visit_Tuple(self, node):
        """For each element: store the parsed code object and the element's
        source text, and merge its identifier sets into the listener's."""
        for element in node.elts:
            parsed = self.code_factory(element, **self.exception_kwargs)
            self.listener.codeargs.append(parsed)

            source_text = ExpressionGenerator(element).value()
            self.listener.args.append(source_text)

            declared = self.listener.declared_identifiers
            self.listener.declared_identifiers = declared.union(
                parsed.declared_identifiers)

            undeclared = self.listener.undeclared_identifiers
            self.listener.undeclared_identifiers = undeclared.union(
                parsed.undeclared_identifiers)
+
+
class ParseFunc(_ast_util.NodeVisitor):
    """AST visitor that copies the pieces of a function signature onto a
    listener object."""

    def __init__(self, listener, **exception_kwargs):
        self.listener = listener
        self.exception_kwargs = exception_kwargs

    def visit_FunctionDef(self, node):
        """Set ``funcname``, ``argnames``, ``defaults``, ``kwargnames``,
        ``kwdefaults``, ``varargs`` and ``kwargs`` on the listener."""
        signature = node.args
        self.listener.funcname = node.name

        # positional names, with the *args name (if any) last
        positional = [arg_id(arg) for arg in signature.args]
        if signature.vararg:
            positional.append(arg_stringname(signature.vararg))

        # keyword-only names (these don't exist in Python 2), with the
        # **kwargs name (if any) last
        keyword_only = []
        if not compat.py2k:
            keyword_only = [arg_id(arg) for arg in signature.kwonlyargs]
        if signature.kwarg:
            keyword_only.append(arg_stringname(signature.kwarg))

        self.listener.argnames = positional
        self.listener.defaults = signature.defaults  # ast
        self.listener.kwargnames = keyword_only
        self.listener.kwdefaults = (
            [] if compat.py2k else signature.kw_defaults)
        self.listener.varargs = signature.vararg
        self.listener.kwargs = signature.kwarg
+
class ExpressionGenerator(object):
    """Turns an AST node back into source text by running a
    ``SourceGenerator`` over it."""

    def __init__(self, astnode):
        """Visit ``astnode`` right away; the output is kept on the
        generator."""
        source_generator = _ast_util.SourceGenerator(' ' * 4)
        source_generator.visit(astnode)
        self.generator = source_generator

    def value(self):
        """Return the generated fragments joined into one string."""
        fragments = self.generator.result
        return ''.join(fragments)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
new file mode 100644
index 00000000000..6b6a35a9215
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
@@ -0,0 +1,878 @@
+# mako/runtime.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""provides runtime services for templates, including Context,
+Namespace, and various helper functions."""
+
+from mako import exceptions, util, compat
+from mako.compat import compat_builtins
+import sys
+
+
class Context(object):
    """Runtime namespace, output buffer stack and caller stack for a
    rendering template.

    See :ref:`runtime_toplevel` for detail on the usage of
    :class:`.Context`.

    """

    def __init__(self, buffer, **data):
        """``buffer`` becomes the bottom of the buffer stack; ``data``
        holds the top-level template arguments."""
        self._buffer_stack = [buffer]
        self._data = data

        # snapshot taken before any built-in names are added below
        self._kwargs = data.copy()

        self._with_template = None
        self._outputting_as_unicode = None
        self.namespaces = {}

        # "capture" proxies to the generic "capture" function,
        # bound to this context
        self._data['capture'] = compat.partial(capture, self)

        # "caller" stack used by def calls with content
        caller_stack = CallerStack()
        self._data['caller'] = caller_stack
        self.caller_stack = caller_stack

    def _set_with_template(self, t):
        """Attach template ``t``; raise ``NameConflictError`` if the data
        uses any of the template's reserved names."""
        self._with_template = t
        clashes = t.reserved_names.intersection(self._data)
        if not clashes:
            return
        raise exceptions.NameConflictError(
            "Reserved words passed to render(): %s" %
            ", ".join(clashes))

    @property
    def lookup(self):
        """Return the :class:`.TemplateLookup` associated
        with this :class:`.Context`.

        """
        template = self._with_template
        return template.lookup

    @property
    def kwargs(self):
        """Return a copy of the top-level keyword arguments of this
        :class:`.Context`.

        Only the arguments originally given to the constructor are
        included; names added to the context afterwards are not.

        A typical use is forwarding ``<%page>`` arguments to an
        inheriting template::

            ${next.body(**context.kwargs)}

        """
        return dict.copy(self._kwargs) if type(self._kwargs) is dict \
            else self._kwargs.copy()

    def push_caller(self, caller):
        """Push a ``caller`` callable onto the callstack for
        this :class:`.Context`."""

        self.caller_stack.append(caller)

    def pop_caller(self):
        """Pop the most recent ``caller`` callable off the callstack of
        this :class:`.Context`."""

        del self.caller_stack[-1]

    def keys(self):
        """Return a list of all names established in this :class:`.Context`."""

        return [name for name in self._data.keys()]

    def __getitem__(self, key):
        """Look ``key`` up in the context data, then in the builtins."""
        if key not in self._data:
            return compat_builtins.__dict__[key]
        return self._data[key]

    def _push_writer(self):
        """push a capturing buffer onto this Context and return
        the new writer function."""

        capture_buffer = util.FastEncodingBuffer()
        self._buffer_stack.append(capture_buffer)
        return capture_buffer.write

    def _pop_buffer_and_writer(self):
        """pop the most recent capturing buffer from this Context
        and return it together with the writer of the buffer now on top.

        """

        popped = self._buffer_stack.pop()
        current_writer = self._buffer_stack[-1].write
        return popped, current_writer

    def _push_buffer(self):
        """push a capturing buffer onto this Context."""

        self._push_writer()

    def _pop_buffer(self):
        """pop the most recent capturing buffer from this Context."""

        return self._buffer_stack.pop()

    def get(self, key, default=None):
        """Return a value from this :class:`.Context`, falling back to
        the builtins and then to ``default``."""

        fallback = compat_builtins.__dict__.get(key, default)
        return self._data.get(key, fallback)

    def write(self, string):
        """Write a string to this :class:`.Context` object's
        underlying output buffer."""

        top_buffer = self._buffer_stack[-1]
        top_buffer.write(string)

    def writer(self):
        """Return the current writer function."""

        top_buffer = self._buffer_stack[-1]
        return top_buffer.write

    def _copy(self):
        """Return a new Context that shares everything with this one
        except the data dictionary, which is copied."""
        clone = Context.__new__(Context)
        clone._buffer_stack = self._buffer_stack
        clone._data = self._data.copy()
        clone._kwargs = self._kwargs
        clone._with_template = self._with_template
        clone._outputting_as_unicode = self._outputting_as_unicode
        clone.namespaces = self.namespaces
        clone.caller_stack = self.caller_stack
        return clone

    def _locals(self, d):
        """Create a new :class:`.Context` with a copy of this
        :class:`.Context`'s current state,
        updated with the given dictionary.

        When ``d`` is empty, this context itself is returned.  The
        :attr:`.Context.kwargs` collection remains unaffected.

        """

        if d:
            updated = self._copy()
            updated._data.update(d)
            return updated
        return self

    def _clean_inheritance_tokens(self):
        """create a new copy of this :class:`.Context`. with
        tokens related to inheritance state removed."""

        cleaned = self._copy()
        for token in ('self', 'parent', 'next'):
            cleaned._data.pop(token, None)
        return cleaned
+
class CallerStack(list):
    """A list of caller frames that also acts as a proxy for the frame
    on top: truth testing and attribute access are forwarded to it."""

    def __init__(self):
        # frame to be pushed by the next _push_frame() call
        self.nextcaller = None

    def __nonzero__(self):
        # Python 2 spelling of __bool__
        return self.__bool__()

    def __bool__(self):
        """True only when the stack is non-empty and its top is truthy."""
        if not len(self):
            return False
        return True if self._get_caller() else False

    def _get_caller(self):
        # this method can be removed once
        # codegen MAGIC_NUMBER moves past 7
        return self[-1]

    def __getattr__(self, key):
        """Forward unknown attributes to the frame on top of the stack."""
        top = self._get_caller()
        return getattr(top, key)

    def _push_frame(self):
        """Push the pending ``nextcaller`` (or None), clear it, and
        return the frame that was pushed."""
        pending = self.nextcaller
        frame = pending if pending else None
        self.append(frame)
        self.nextcaller = None
        return frame

    def _pop_frame(self):
        """Pop the top frame and keep it as the pending ``nextcaller``."""
        self.nextcaller = self.pop()
+
+
class Undefined(object):
    """Placeholder for a value that is not defined in a template.

    The module-level constant ``UNDEFINED`` is an instance of this
    class.  The object is always false, and converting it to a string
    raises ``NameError``.

    """

    def __bool__(self):
        return False

    def __nonzero__(self):
        # Python 2 spelling of __bool__
        return self.__bool__()

    def __str__(self):
        raise NameError("Undefined")

UNDEFINED = Undefined()
+
class LoopStack(object):
    """A stack of :class:`.LoopContext` objects.

    ``_enter`` / ``_exit`` push and pop a loop context and return
    whatever is then on top; with an empty stack the "top" is the
    stack object itself.
    """

    def __init__(self):
        self.stack = []

    def _enter(self, iterable):
        """Push a loop context for ``iterable`` and return it."""
        self._push(iterable)
        return self._top

    def _exit(self):
        """Pop the current loop context and return the new top."""
        self._pop()
        return self._top

    @property
    def _top(self):
        """The innermost loop context, or this object when there is none."""
        return self.stack[-1] if self.stack else self

    def _pop(self):
        return self.stack.pop()

    def _push(self, iterable):
        """Wrap ``iterable`` in a LoopContext whose ``parent`` is the
        current top (if any) and push it."""
        context = LoopContext(iterable)
        if self.stack:
            context.parent = self.stack[-1]
        return self.stack.append(context)

    def __getattr__(self, key):
        # reached only for attributes this object does not have
        raise exceptions.RuntimeException("No loop context is established")

    def __iter__(self):
        return iter(self._top)
+
+
class LoopContext(object):
    """A magic loop variable.
    Automatically accessible in any ``% for`` block.

    See the section :ref:`loop_context` for usage
    notes.

    :attr:`parent` -> :class:`.LoopContext` or ``None``
        The enclosing loop, if there is one.
    :attr:`index` -> `int`
        The 0-based iteration count.
    :attr:`reverse_index` -> `int`
        How many iterations are left after the current one.
    :attr:`first` -> `bool`
        ``True`` during the first iteration only.
    :attr:`last` -> `bool`
        ``True`` during the last iteration only.
    :attr:`even` -> `bool`
        ``True`` when ``index`` is even.
    :attr:`odd` -> `bool`
        ``True`` when ``index`` is odd.
    """

    def __init__(self, iterable):
        self.parent = None
        self.index = 0
        self._iterable = iterable

    def __iter__(self):
        """Yield the items of the wrapped iterable; ``index`` is bumped
        after each item has been consumed."""
        for item in self._iterable:
            yield item
            self.index += 1

    @util.memoized_instancemethod
    def __len__(self):
        return len(self._iterable)

    @property
    def reverse_index(self):
        total = len(self)
        return total - self.index - 1

    @property
    def first(self):
        return self.index == 0

    @property
    def last(self):
        final_index = len(self) - 1
        return self.index == final_index

    @property
    def even(self):
        return not self.odd

    @property
    def odd(self):
        remainder = self.index % 2
        return bool(remainder)

    def cycle(self, *values):
        """Return the value matching the current iteration, wrapping
        around when the loop outruns ``values``.
        """
        if not values:
            raise ValueError("You must provide values to cycle through")
        position = self.index % len(values)
        return values[position]
+
+
+class _NSAttr(object):
+ def __init__(self, parent):
+ self.__parent = parent
+ def __getattr__(self, key):
+ ns = self.__parent
+ while ns:
+ if hasattr(ns.module, key):
+ return getattr(ns.module, key)
+ else:
+ ns = ns.inherits
+ raise AttributeError(key)
+
+class Namespace(object):
+ """Provides access to collections of rendering methods, which
+ can be local, from other templates, or from imported modules.
+
+ To access a particular rendering method referenced by a
+ :class:`.Namespace`, use plain attribute access:
+
+ .. sourcecode:: mako
+
+ ${some_namespace.foo(x, y, z)}
+
+ :class:`.Namespace` also contains several built-in attributes
+ described here.
+
+ """
+
+ def __init__(self, name, context,
+ callables=None, inherits=None,
+ populate_self=True, calling_uri=None):
+ self.name = name
+ self.context = context
+ self.inherits = inherits
+ if callables is not None:
+ self.callables = dict([(c.__name__, c) for c in callables])
+
+ callables = ()
+
+ module = None
+ """The Python module referenced by this :class:`.Namespace`.
+
+ If the namespace references a :class:`.Template`, then
+ this module is the equivalent of ``template.module``,
+ i.e. the generated module for the template.
+
+ """
+
+ template = None
+ """The :class:`.Template` object referenced by this
+ :class:`.Namespace`, if any.
+
+ """
+
+ context = None
+ """The :class:`.Context` object for this :class:`.Namespace`.
+
+ Namespaces are often created with copies of contexts that
+ contain slightly different data, particularly in inheritance
+ scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one
+ can traverse an entire chain of templates that inherit from
+ one-another.
+
+ """
+
+ filename = None
+ """The path of the filesystem file used for this
+ :class:`.Namespace`'s module or template.
+
+ If this is a pure module-based
+ :class:`.Namespace`, this evaluates to ``module.__file__``. If a
+ template-based namespace, it evaluates to the original
+ template file location.
+
+ """
+
+ uri = None
+ """The URI for this :class:`.Namespace`'s template.
+
+ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`.
+
+ This is the equivalent of :attr:`.Template.uri`.
+
+ """
+
+ _templateuri = None
+
@util.memoized_property
def attr(self):
    """Access module level attributes by name.

    Returns an ``_NSAttr`` wrapper built around this namespace; the
    property is memoized, so the wrapper is created once per
    namespace.  It lets templates supply "scalar" attributes, which
    is particularly handy in inheritance relationships.

    .. seealso::

        :ref:`inheritance_attr`

        :ref:`namespace_attr_for_includes`

    """
    accessor = _NSAttr(self)
    return accessor
+
def get_namespace(self, uri):
    """Return a :class:`.Namespace` corresponding to the given ``uri``.

    A relative ``uri`` (one without a leading slash ``/``) is resolved
    against the ``uri`` of this namespace, so the method is mostly
    useful off of the built-in ``local`` namespace, described in
    :ref:`namespace_local`.

    Templates normally load namespaces with the ``<%namespace>`` tag.
    Since those tags are all evaluated before the template body runs,
    this method covers the remaining cases: locating a namespace from
    an expression computed in the body, or using one conditionally.

    Namespaces are cached on ``self.context.namespaces`` under the
    key ``(self, uri)``.

    """
    cache_key = (self, uri)
    if cache_key in self.context.namespaces:
        return self.context.namespaces[cache_key]

    # not seen yet: build it against a copy of the context and remember it
    created = TemplateNamespace(uri, self.context._copy(),
                                templateuri=uri,
                                calling_uri=self._templateuri)
    self.context.namespaces[cache_key] = created
    return created
+
def get_template(self, uri):
    """Return a :class:`.Template` from the given ``uri``.

    The lookup is delegated to ``_lookup_template`` and the ``uri``
    is resolved relative to the ``uri`` of this
    :class:`.Namespace` object's :class:`.Template`.

    """
    found = _lookup_template(self.context, uri, self._templateuri)
    return found
+
def get_cached(self, key, **kwargs):
    """Return a value from the :class:`.Cache` referenced by this
    :class:`.Namespace` object's :class:`.Template`.

    Compared with using the :class:`.Cache` directly, going through
    this method means the configuration parameters declared in
    ``<%page>`` take effect, so the same configured backend is used.

    Extra keyword arguments are forwarded to ``self.cache.get()``.

    """
    template_cache = self.cache
    return template_cache.get(key, **kwargs)
+
@property
def cache(self):
    """The :class:`.Cache` object of the :class:`.Template`
    referenced by this :class:`.Namespace` (``self.template.cache``).

    """
    owner = self.template
    return owner.cache
+
def include_file(self, uri, **kwargs):
    """Include a file at the given ``uri``.

    Delegates to ``_include_file`` using this namespace's context and
    template uri; keyword arguments are forwarded unchanged.
    """
    _include_file(self.context, uri, self._templateuri, **kwargs)
+
+ def _populate(self, d, l):
+ for ident in l:
+ if ident == '*':
+ for (k, v) in self._get_star():
+ d[k] = v
+ else:
+ d[ident] = getattr(self, ident)
+
+ def _get_star(self):
+ if self.callables:
+ for key in self.callables:
+ yield (key, self.callables[key])
+
+ def __getattr__(self, key):
+ if key in self.callables:
+ val = self.callables[key]
+ elif self.inherits:
+ val = getattr(self.inherits, key)
+ else:
+ raise AttributeError(
+ "Namespace '%s' has no member '%s'" %
+ (self.name, key))
+ setattr(self, key, val)
+ return val
+
class TemplateNamespace(Namespace):
    """A :class:`.Namespace` specific to a :class:`.Template` instance."""

    def __init__(self, name, context, template=None, templateuri=None,
                 callables=None, inherits=None,
                 populate_self=True, calling_uri=None):
        """Bind the namespace to a template.

        The template is either looked up from ``templateuri``
        (relative to ``calling_uri``) or passed directly as
        ``template``; ``templateuri`` wins when both are given.

        :raises TypeError: if neither ``templateuri`` nor ``template``
         is supplied.
        """
        self.name = name
        self.context = context
        self.inherits = inherits
        if callables is not None:
            self.callables = dict((fn.__name__, fn) for fn in callables)

        if templateuri is None and template is None:
            raise TypeError("'template' argument is required.")
        if templateuri is not None:
            template = _lookup_template(context, templateuri, calling_uri)
        self.template = template
        self._templateuri = template.module._template_uri

        if populate_self:
            # called for its effect on ``context``; the returned
            # (callable, context) pair is not needed here
            _populate_self_namespace(context, self.template, self_ns=self)

    @property
    def module(self):
        """The Python module referenced by this :class:`.Namespace`,
        i.e. ``template.module``, the generated module for the
        template.

        """
        return self.template.module

    @property
    def filename(self):
        """The path of the filesystem file used for this
        :class:`.Namespace`'s template (``template.filename``).
        """
        return self.template.filename

    @property
    def uri(self):
        """The URI for this :class:`.Namespace`'s template.

        I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`.

        This is the equivalent of :attr:`.Template.uri`.

        """
        return self.template.uri

    def _get_star(self):
        """Yield the explicit callables, then every name listed in the
        template module's ``_exports`` bound to this namespace's
        context."""
        if self.callables:
            for name in self.callables:
                yield (name, self.callables[name])
        for export in self.template.module._exports:
            render_fn = self.template._get_def_callable(export)
            yield (export, compat.partial(render_fn, self.context))

    def __getattr__(self, key):
        """Resolve ``key`` from the explicit callables, then the
        template's defs, then the inherited namespace; the result is
        stored on the instance."""
        if key in self.callables:
            found = self.callables[key]
        elif self.template.has_def(key):
            render_fn = self.template._get_def_callable(key)
            found = compat.partial(render_fn, self.context)
        elif self.inherits:
            found = getattr(self.inherits, key)
        else:
            raise AttributeError(
                "Namespace '%s' has no member '%s'" %
                (self.name, key))
        setattr(self, key, found)
        return found
+
class ModuleNamespace(Namespace):
    """A :class:`.Namespace` specific to a Python module instance."""

    def __init__(self, name, context, module,
                 callables=None, inherits=None,
                 populate_self=True, calling_uri=None):
        """Import the dotted module path ``module`` and bind it.

        ``populate_self`` and ``calling_uri`` are accepted but unused.
        """
        self.name = name
        self.context = context
        self.inherits = inherits
        if callables is not None:
            self.callables = dict((fn.__name__, fn) for fn in callables)

        # __import__ returns the top-level package, so walk down the
        # remaining dotted components by attribute access
        resolved = __import__(module)
        for part in module.split('.')[1:]:
            resolved = getattr(resolved, part)
        self.module = resolved

    @property
    def filename(self):
        """The path of the filesystem file used for this
        :class:`.Namespace`'s module (``module.__file__``).
        """
        return self.module.__file__

    def _get_star(self):
        """Yield the explicit callables, then every callable module
        attribute whose name does not start with ``_``, bound to this
        namespace's context."""
        if self.callables:
            for name in self.callables:
                yield (name, self.callables[name])
        for attr_name in dir(self.module):
            if attr_name[0] == '_':
                continue
            candidate = getattr(self.module, attr_name)
            if compat.callable(candidate):
                yield attr_name, compat.partial(candidate, self.context)

    def __getattr__(self, key):
        """Resolve ``key`` from the explicit callables, then the
        module's attributes (bound to the context), then the inherited
        namespace; the result is stored on the instance."""
        if key in self.callables:
            found = self.callables[key]
        elif hasattr(self.module, key):
            found = compat.partial(getattr(self.module, key), self.context)
        elif self.inherits:
            found = getattr(self.inherits, key)
        else:
            raise AttributeError(
                "Namespace '%s' has no member '%s'" %
                (self.name, key))
        setattr(self, key, found)
        return found
+
def supports_caller(func):
    """Apply a caller_stack compatibility decorator to a plain
    Python function.

    The returned wrapper pushes a frame on ``context.caller_stack``
    before calling ``func`` and pops it afterwards, whether or not
    ``func`` raises.

    See the example in :ref:`namespaces_python_modules`.

    """

    def wrap_stackframe(context, *args, **kwargs):
        context.caller_stack._push_frame()
        try:
            result = func(context, *args, **kwargs)
        finally:
            context.caller_stack._pop_frame()
        return result
    return wrap_stackframe
+
def capture(context, callable_, *args, **kwargs):
    """Execute the given template def, capturing the output into
    a buffer.

    A buffer is pushed on ``context``, ``callable_(*args, **kwargs)``
    is invoked, and the buffer is popped again (even on error); the
    buffer's contents are returned.

    See the example in :ref:`namespaces_python_modules`.

    :raises exceptions.RuntimeException: if ``callable_`` is not
     callable.

    """

    if not compat.callable(callable_):
        message = ("capture() function expects a callable as "
                   "its argument (i.e. capture(func, *args, **kwargs))")
        raise exceptions.RuntimeException(message)

    context._push_buffer()
    try:
        callable_(*args, **kwargs)
    finally:
        captured = context._pop_buffer()
    return captured.getvalue()
+
+def _decorate_toplevel(fn):
+ def decorate_render(render_fn):
+ def go(context, *args, **kw):
+ def y(*args, **kw):
+ return render_fn(context, *args, **kw)
+ try:
+ y.__name__ = render_fn.__name__[7:]
+ except TypeError:
+ # < Python 2.4
+ pass
+ return fn(y)(context, *args, **kw)
+ return go
+ return decorate_render
+
+def _decorate_inline(context, fn):
+ def decorate_render(render_fn):
+ dec = fn(render_fn)
+ def go(*args, **kw):
+ return dec(context, *args, **kw)
+ return go
+ return decorate_render
+
def _include_file(context, uri, calling_uri, **kwargs):
    """locate the template from the given uri and include it in
    the current output.

    The template's render callable is invoked with a context that has
    had its inheritance tokens cleaned; keyword arguments are
    completed from ``context._data`` by ``_kwargs_for_include``.
    """

    included = _lookup_template(context, uri, calling_uri)
    render_fn, render_ctx = _populate_self_namespace(
        context._clean_inheritance_tokens(),
        included)
    call_kwargs = _kwargs_for_include(render_fn, context._data, **kwargs)
    render_fn(render_ctx, **call_kwargs)
+
def _inherit_from(context, uri, calling_uri):
    """called by the _inherit method in template modules to set
    up the inheritance chain at the start of a template's
    execution.

    Returns ``None`` when ``uri`` is ``None``; otherwise a
    ``(callable, context)`` pair, either as produced by the parent
    template's own ``_mako_inherit`` hook or the parent's render
    callable paired with the local context built here.
    """

    if uri is None:
        return None

    parent_template = _lookup_template(context, uri, calling_uri)

    # walk to the end of the existing inheritance chain
    tail = context['self']
    while tail.inherits is not None:
        tail = tail.inherits

    lclcontext = context._locals({'next': tail})
    parent_ns = TemplateNamespace("self:%s" % parent_template.uri,
                                  lclcontext,
                                  template=parent_template,
                                  populate_self=False)
    tail.inherits = parent_ns
    context._data['parent'] = lclcontext._data['local'] = tail.inherits

    inherit_hook = getattr(parent_template.module, '_mako_inherit', None)
    if inherit_hook is not None:
        chained = inherit_hook(parent_template, lclcontext)
        if chained:
            return chained

    namespace_hook = getattr(parent_template.module,
                             '_mako_generate_namespaces', None)
    if namespace_hook is not None:
        namespace_hook(context)
    return (parent_template.callable_, lclcontext)
+
def _lookup_template(context, uri, relativeto):
    """Resolve ``uri`` (relative to ``relativeto``) through the
    lookup attached to the context's current template.

    :raises exceptions.TemplateLookupException: if that template has
     no lookup, or if the lookup raises ``TopLevelLookupException``.
    """
    owner = context._with_template
    lookup = owner.lookup
    if lookup is None:
        raise exceptions.TemplateLookupException(
            "Template '%s' has no TemplateLookup associated" %
            context._with_template.uri)

    adjusted = lookup.adjust_uri(uri, relativeto)
    try:
        return lookup.get_template(adjusted)
    except exceptions.TopLevelLookupException:
        raise exceptions.TemplateLookupException(str(compat.exception_as()))
+
def _populate_self_namespace(context, template, self_ns=None):
    """Install the ``self`` and ``local`` namespaces in ``context``.

    A new :class:`TemplateNamespace` is built for ``template`` unless
    ``self_ns`` is supplied.  Returns the pair produced by the
    template module's ``_mako_inherit`` hook when it exists and
    returns something truthy, else ``(template.callable_, context)``.
    """
    if self_ns is None:
        self_ns = TemplateNamespace('self:%s' % template.uri,
                                    context, template=template,
                                    populate_self=False)
    context._data['self'] = context._data['local'] = self_ns

    if hasattr(template.module, '_mako_inherit'):
        inherited = template.module._mako_inherit(template, context)
        if inherited:
            return inherited
    return (template.callable_, context)
+
def _render(template, callable_, args, data, as_unicode=False):
    """create a Context and return the string
    output of the given template and template callable.

    The output buffer is a unicode ``FastEncodingBuffer`` when
    ``as_unicode`` is set, a ``StringIO`` when the template has
    ``bytestring_passthrough`` set, and otherwise a
    ``FastEncodingBuffer`` configured with the template's
    ``output_encoding`` and ``encoding_errors``.
    """

    if as_unicode:
        out = util.FastEncodingBuffer(as_unicode=True)
    elif template.bytestring_passthrough:
        out = compat.StringIO()
    else:
        out = util.FastEncodingBuffer(
            as_unicode=as_unicode,
            encoding=template.output_encoding,
            errors=template.encoding_errors)

    context = Context(out, **data)
    context._outputting_as_unicode = as_unicode
    context._set_with_template(template)

    render_kwargs = _kwargs_for_callable(callable_, data)
    _render_context(template, callable_, context, *args, **render_kwargs)
    return context._pop_buffer().getvalue()
+
def _kwargs_for_callable(callable_, data):
    """Select the entries of ``data`` that ``callable_`` accepts.

    ``data`` is returned as-is when the inspected argspec has a
    truthy third element (for normal pages ``**pageargs`` is usually
    present).  Otherwise only the named arguments other than
    ``context`` that are present in ``data`` are returned.
    """
    argspec = compat.inspect_func_args(callable_)
    # for normal pages, **pageargs is usually present
    if argspec[2]:
        return data

    # for rendering defs from the top level, figure out the args
    extras = [name for name in argspec[1:3] if name is not None]
    candidates = argspec[0] + extras
    return dict(
        (name, data[name])
        for name in candidates
        if name != 'context' and name in data
    )
+
def _kwargs_for_include(callable_, data, **kwargs):
    """Complete ``kwargs`` for an include call.

    Every named argument of ``callable_`` other than ``context`` that
    is present in ``data`` and not already given in ``kwargs`` is
    copied from ``data``.  Explicit ``kwargs`` always win.
    """
    argspec = compat.inspect_func_args(callable_)
    extras = [name for name in argspec[1:3] if name is not None]
    for name in argspec[0] + extras:
        if name == 'context' or name not in data:
            continue
        kwargs.setdefault(name, data[name])
    return kwargs
+
def _render_context(tmpl, callable_, context, *args, **kwargs):
    """Render ``tmpl`` into ``context``.

    For an ordinary template the render starts from whatever
    ``_populate_self_namespace`` returns (the base of the inheritance
    stack).  For a ``DefTemplate`` the ``self`` namespace is populated
    from the parent template and ``callable_`` itself is executed.
    """
    from mako.template import DefTemplate

    # create polymorphic 'self' namespace for this
    # template with possibly updated context
    if isinstance(tmpl, DefTemplate):
        # call the actual rendering method specified; the namespace
        # population is done only for its effect on ``context``
        _populate_self_namespace(context, tmpl.parent)
        _exec_template(callable_, context, args=args, kwargs=kwargs)
    else:
        # if main render method, call from the base of the inheritance stack
        base_callable, base_context = _populate_self_namespace(context, tmpl)
        _exec_template(base_callable, base_context, args=args, kwargs=kwargs)
+
+def _exec_template(callable_, context, args=None, kwargs=None):
+ """execute a rendering callable given the callable, a
+ Context, and optional explicit arguments
+
+ the contextual Template will be located if it exists, and
+ the error handling options specified on that Template will
+ be interpreted here.
+ """
+ template = context._with_template
+ if template is not None and \
+ (template.format_exceptions or template.error_handler):
+ try:
+ callable_(context, *args, **kwargs)
+ except Exception:
+ _render_error(template, context, compat.exception_as())
+ except:
+ e = sys.exc_info()[0]
+ _render_error(template, context, e)
+ else:
+ callable_(context, *args, **kwargs)
+
def _render_error(template, context, error):
    """Report ``error`` for ``template``.

    When the template has an ``error_handler`` it is called with
    ``(context, error)`` and the active exception is re-raised unless
    the handler returns a truthy value.  Otherwise the context's
    buffer stack is replaced with a single fresh buffer and the HTML
    error template is rendered into the context.
    """
    if template.error_handler:
        handled = template.error_handler(context, error)
        if not handled:
            compat.reraise(*sys.exc_info())
        return

    error_template = exceptions.html_error_template()
    if context._outputting_as_unicode:
        fresh = util.FastEncodingBuffer(as_unicode=True)
    else:
        fresh = util.FastEncodingBuffer(
            error_template.output_encoding,
            error_template.encoding_errors)
    # slice-assign so the existing list object is reused
    context._buffer_stack[:] = [fresh]

    context._set_with_template(error_template)
    error_template.render_context(context, error=error)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py
new file mode 100644
index 00000000000..fb6106289fa
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py
@@ -0,0 +1,705 @@
+# mako/template.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""Provides the Template class, a facade for parsing, generating and executing
+template strings, as well as template runtime operations."""
+
+from mako.lexer import Lexer
+from mako import runtime, util, exceptions, codegen, cache, compat
+import os
+import re
+import shutil
+import stat
+import sys
+import tempfile
+import types
+import weakref
+
+
+class Template(object):
+ """Represents a compiled template.
+
+ :class:`.Template` includes a reference to the original
+ template source (via the :attr:`.source` attribute)
+ as well as the source code of the
+ generated Python module (i.e. the :attr:`.code` attribute),
+ as well as a reference to an actual Python module.
+
+ :class:`.Template` is constructed using either a literal string
+ representing the template text, or a filename representing a filesystem
+ path to a source file.
+
+ :param text: textual template source. This argument is mutually
+ exclusive versus the ``filename`` parameter.
+
+ :param filename: filename of the source template. This argument is
+ mutually exclusive versus the ``text`` parameter.
+
+ :param buffer_filters: string list of filters to be applied
+ to the output of ``%def``\ s which are buffered, cached, or otherwise
+ filtered, after all filters
+ defined with the ``%def`` itself have been applied. Allows the
+ creation of default expression filters that let the output
+ of return-valued ``%def``\ s "opt out" of that filtering via
+ passing special attributes or objects.
+
+ :param bytestring_passthrough: When ``True``, and ``output_encoding`` is
+ set to ``None``, and :meth:`.Template.render` is used to render,
+ the `StringIO` or `cStringIO` buffer will be used instead of the
+ default "fast" buffer. This allows raw bytestrings in the
+ output stream, such as in expressions, to pass straight
+ through to the buffer. This flag is forced
+ to ``True`` if ``disable_unicode`` is also configured.
+
+ .. versionadded:: 0.4
+ Added to provide the same behavior as that of the previous series.
+
+ :param cache_args: Dictionary of cache configuration arguments that
+ will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`.
+
+ :param cache_dir:
+
+ .. deprecated:: 0.6
+ Use the ``'dir'`` argument in the ``cache_args`` dictionary.
+ See :ref:`caching_toplevel`.
+
+ :param cache_enabled: Boolean flag which enables caching of this
+ template. See :ref:`caching_toplevel`.
+
+ :param cache_impl: String name of a :class:`.CacheImpl` caching
+ implementation to use. Defaults to ``'beaker'``.
+
+ :param cache_type:
+
+ .. deprecated:: 0.6
+ Use the ``'type'`` argument in the ``cache_args`` dictionary.
+ See :ref:`caching_toplevel`.
+
+ :param cache_url:
+
+ .. deprecated:: 0.6
+ Use the ``'url'`` argument in the ``cache_args`` dictionary.
+ See :ref:`caching_toplevel`.
+
+ :param default_filters: List of string filter names that will
+ be applied to all expressions. See :ref:`filtering_default_filters`.
+
+ :param disable_unicode: Disables all awareness of Python Unicode
+ objects. See :ref:`unicode_disabled`.
+
+ :param enable_loop: When ``True``, enable the ``loop`` context variable.
+ This can be set to ``False`` to support templates that may
+ be making usage of the name "``loop``". Individual templates can
+ re-enable the "loop" context by placing the directive
+ ``enable_loop="True"`` inside the ``<%page>`` tag -- see
+ :ref:`migrating_loop`.
+
+ :param encoding_errors: Error parameter passed to ``encode()`` when
+ string encoding is performed. See :ref:`usage_unicode`.
+
+ :param error_handler: Python callable which is called whenever
+ compile or runtime exceptions occur. The callable is passed
+ the current context as well as the exception. If the
+ callable returns ``True``, the exception is considered to
+ be handled, else it is re-raised after the function
+ completes. Is used to provide custom error-rendering
+ functions.
+
+ :param format_exceptions: if ``True``, exceptions which occur during
+ the render phase of this template will be caught and
+ formatted into an HTML error page, which then becomes the
+ rendered result of the :meth:`.render` call. Otherwise,
+ runtime exceptions are propagated outwards.
+
+ :param imports: String list of Python statements, typically individual
+ "import" lines, which will be placed into the module level
+ preamble of all generated Python modules. See the example
+ in :ref:`filtering_default_filters`.
+
+ :param future_imports: String list of names to import from `__future__`.
+ These will be concatenated into a comma-separated string and inserted
+ into the beginning of the template, e.g. ``future_imports=['FOO',
+ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're
+ interested in using features like the new division operator, you must
+ use future_imports to convey that to the renderer, as otherwise the
+ import will not appear as the first executed statement in the generated
+ code and will therefore not have the desired effect.
+
+ :param input_encoding: Encoding of the template's source code. Can
+ be used in lieu of the coding comment. See
+ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for
+ details on source encoding.
+
+ :param lookup: a :class:`.TemplateLookup` instance that will be used
+ for all file lookups via the ``<%namespace>``,
+ ``<%include>``, and ``<%inherit>`` tags. See
+ :ref:`usage_templatelookup`.
+
+ :param module_directory: Filesystem location where generated
+ Python module files will be placed.
+
+ :param module_filename: Overrides the filename of the generated
+ Python module file. For advanced usage only.
+
+ :param module_writer: A callable which overrides how the Python
+ module is written entirely. The callable is passed the
+ encoded source content of the module and the destination
+ path to be written to. The default behavior of module writing
+ uses a tempfile in conjunction with a file move in order
+ to make the operation atomic. So a user-defined module
+ writing function that mimics the default behavior would be:
+
+ .. sourcecode:: python
+
+ import tempfile
+ import os
+ import shutil
+
+ def module_writer(source, outputpath):
+ (dest, name) = \\
+ tempfile.mkstemp(
+ dir=os.path.dirname(outputpath)
+ )
+
+ os.write(dest, source)
+ os.close(dest)
+ shutil.move(name, outputpath)
+
+ from mako.template import Template
+ mytemplate = Template(
+ filename="index.html",
+ module_directory="/path/to/modules",
+ module_writer=module_writer
+ )
+
+ The function is provided for unusual configurations where
+ certain platform-specific permissions or other special
+ steps are needed.
+
+ :param output_encoding: The encoding to use when :meth:`.render`
+ is called.
+ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`.
+
+ :param preprocessor: Python callable which will be passed
+ the full template source before it is parsed. The return
+ result of the callable will be used as the template source
+ code.
+
+ :param lexer_cls: A :class:`.Lexer` class used to parse
+ the template. The :class:`.Lexer` class is used by
+ default.
+
+ .. versionadded:: 0.7.4
+
+ :param strict_undefined: Replaces the automatic usage of
+ ``UNDEFINED`` for any undeclared variables not located in
+ the :class:`.Context` with an immediate raise of
+ ``NameError``. The advantage is immediate reporting of
+ missing variables which include the name.
+
+ .. versionadded:: 0.3.6
+
+ :param uri: string URI or other identifier for this template.
+ If not provided, the ``uri`` is generated from the filesystem
+ path, or from the in-memory identity of a non-file-based
+ template. The primary usage of the ``uri`` is to provide a key
+ within :class:`.TemplateLookup`, as well as to generate the
+ file path of the generated Python module file, if
+ ``module_directory`` is specified.
+
+ """
+
+ lexer_cls = Lexer
+
def __init__(self,
             text=None,
             filename=None,
             uri=None,
             format_exceptions=False,
             error_handler=None,
             lookup=None,
             output_encoding=None,
             encoding_errors='strict',
             module_directory=None,
             cache_args=None,
             cache_impl='beaker',
             cache_enabled=True,
             cache_type=None,
             cache_dir=None,
             cache_url=None,
             module_filename=None,
             input_encoding=None,
             disable_unicode=False,
             module_writer=None,
             bytestring_passthrough=False,
             default_filters=None,
             buffer_filters=(),
             strict_undefined=False,
             imports=None,
             future_imports=None,
             enable_loop=True,
             preprocessor=None,
             lexer_cls=None):
    # Construct the template from ``text`` or ``filename`` and compile
    # it to a module; raises RuntimeException when neither is given.
    #
    # Derive the module identifier and uri: an explicit ``uri`` wins,
    # then the filename (drive stripped, separators normalised to "/"),
    # and finally an in-memory identity based on id(self).
    if uri:
        self.module_id = re.sub(r'\W', "_", uri)
        self.uri = uri
    elif filename:
        self.module_id = re.sub(r'\W', "_", filename)
        drive, path = os.path.splitdrive(filename)
        path = os.path.normpath(path).replace(os.path.sep, "/")
        self.uri = path
    else:
        self.module_id = "memory:" + hex(id(self))
        self.uri = self.module_id

    # Normalise the uri without its leading slash; a result starting
    # with ".." is rejected as escaping the root path.
    u_norm = self.uri
    if u_norm.startswith("/"):
        u_norm = u_norm[1:]
    u_norm = os.path.normpath(u_norm)
    if u_norm.startswith(".."):
        raise exceptions.TemplateLookupException(
            "Template uri \"%s\" is invalid - "
            "it cannot be relative outside "
            "of the root path." % self.uri)

    # These settings are stored before compilation because the
    # module-level _compile() helper reads them from the template.
    self.input_encoding = input_encoding
    self.output_encoding = output_encoding
    self.encoding_errors = encoding_errors
    self.disable_unicode = disable_unicode
    # disable_unicode forces bytestring passthrough on
    self.bytestring_passthrough = bytestring_passthrough or disable_unicode
    self.enable_loop = enable_loop
    self.strict_undefined = strict_undefined
    self.module_writer = module_writer

    # disable_unicode is rejected on Python 3 and cannot be combined
    # with an output encoding
    if compat.py3k and disable_unicode:
        raise exceptions.UnsupportedError(
            "Mako for Python 3 does not "
            "support disabling Unicode")
    elif output_encoding and disable_unicode:
        raise exceptions.UnsupportedError(
            "output_encoding must be set to "
            "None when disable_unicode is used.")
    # default expression filter: 'str' on Python 3 or with unicode
    # disabled, 'unicode' otherwise
    if default_filters is None:
        if compat.py3k or self.disable_unicode:
            self.default_filters = ['str']
        else:
            self.default_filters = ['unicode']
    else:
        self.default_filters = default_filters
    self.buffer_filters = buffer_filters

    self.imports = imports
    self.future_imports = future_imports
    self.preprocessor = preprocessor

    # only override the class-level lexer_cls when one was passed
    if lexer_cls is not None:
        self.lexer_cls = lexer_cls

    # if plain text, compile code in memory only
    if text is not None:
        (code, module) = _compile_text(self, text, filename)
        self._code = code
        self._source = text
        # constructed for its registration side effect; the instance
        # attaches itself to this template inside ModuleInfo.__init__
        ModuleInfo(module, None, self, filename, code, text)
    elif filename is not None:
        # if template filename and a module directory, load
        # a filesystem-based module file, generating if needed
        if module_filename is not None:
            path = module_filename
        elif module_directory is not None:
            path = os.path.abspath(
                os.path.join(
                    os.path.normpath(module_directory),
                    u_norm + ".py"
                )
            )
        else:
            path = None
        module = self._compile_from_file(path, filename)
    else:
        raise exceptions.RuntimeException(
            "Template requires text or filename")

    self.module = module
    self.filename = filename
    # the module's render_body is the template's main render callable
    self.callable_ = self.module.render_body
    self.format_exceptions = format_exceptions
    self.error_handler = error_handler
    self.lookup = lookup

    self.module_directory = module_directory

    self._setup_cache_args(
        cache_impl, cache_enabled, cache_args,
        cache_type, cache_dir, cache_url
    )
+
+
@util.memoized_property
def reserved_names(self):
    """The set of reserved names for this template:
    ``codegen.RESERVED_NAMES``, minus ``'loop'`` when ``enable_loop``
    is off.  Memoized per template."""
    if not self.enable_loop:
        return codegen.RESERVED_NAMES.difference(['loop'])
    return codegen.RESERVED_NAMES
+
+ def _setup_cache_args(self,
+ cache_impl, cache_enabled, cache_args,
+ cache_type, cache_dir, cache_url):
+ self.cache_impl = cache_impl
+ self.cache_enabled = cache_enabled
+ if cache_args:
+ self.cache_args = cache_args
+ else:
+ self.cache_args = {}
+
+ # transfer deprecated cache_* args
+ if cache_type:
+ self.cache_args['type'] = cache_type
+ if cache_dir:
+ self.cache_args['dir'] = cache_dir
+ if cache_url:
+ self.cache_args['url'] = cache_url
+
def _compile_from_file(self, path, filename):
    """Compile the template file ``filename`` and return its module.

    With ``path`` set, the generated module lives in that file: it is
    (re)written when missing or older than the template, loaded, and
    regenerated and reloaded once more if its ``_magic_number`` does
    not match ``codegen.MAGIC_NUMBER``.  With ``path`` of ``None`` the
    template is compiled in memory only.
    """
    if path is None:
        # template filename and no module directory, compile code
        # in memory
        template_text = util.read_file(filename)
        code, module = _compile_text(self, template_text, filename)
        self._source = None
        self._code = code
        ModuleInfo(module, None, self, filename, code, None)
        return module

    def write_module_file():
        _compile_module_file(
            self,
            util.read_file(filename),
            filename,
            path,
            self.module_writer)

    def load_module_file():
        loaded = compat.load_module(self.module_id, path)
        # drop the entry from sys.modules again right after loading
        del sys.modules[self.module_id]
        return loaded

    util.verify_directory(os.path.dirname(path))
    template_mtime = os.stat(filename)[stat.ST_MTIME]
    is_stale = (not os.path.exists(path) or
                os.stat(path)[stat.ST_MTIME] < template_mtime)
    if is_stale:
        write_module_file()

    module = load_module_file()
    if module._magic_number != codegen.MAGIC_NUMBER:
        write_module_file()
        module = load_module_file()

    ModuleInfo(module, path, self, filename, None, None)
    return module
+
@property
def source(self):
    """Return the template source code for this :class:`.Template`,
    as reported by the module info registered for its render
    callable."""
    info = _get_module_info_from_callable(self.callable_)
    return info.source
+
@property
def code(self):
    """Return the module source code for this :class:`.Template`,
    as reported by the module info registered for its render
    callable."""
    info = _get_module_info_from_callable(self.callable_)
    return info.code
+
@util.memoized_property
def cache(self):
    """The ``cache.Cache`` built for this template; memoized, so it
    is created on first access only."""
    template_cache = cache.Cache(self)
    return template_cache
+
@property
def cache_dir(self):
    """The ``'dir'`` entry of ``cache_args``; raises ``KeyError`` when
    that entry is absent."""
    args = self.cache_args
    return args['dir']
@property
def cache_url(self):
    """The ``'url'`` entry of ``cache_args``; raises ``KeyError`` when
    that entry is absent."""
    args = self.cache_args
    return args['url']
@property
def cache_type(self):
    """The ``'type'`` entry of ``cache_args``; raises ``KeyError``
    when that entry is absent."""
    args = self.cache_args
    return args['type']
+
def render(self, *args, **data):
    """Render the output of this template as a string.

    If the template specifies an output encoding, the string
    will be encoded accordingly, else the output is raw (raw
    output uses `cStringIO` and can't handle multibyte
    characters). A :class:`.Context` object is created corresponding
    to the given data. Arguments that are explicitly declared
    by this template's internal rendering method are also
    pulled from the given ``*args``, ``**data`` members.

    The work is delegated to ``runtime._render``.

    """
    rendered = runtime._render(self, self.callable_, args, data)
    return rendered
+
def render_unicode(self, *args, **data):
    """Render the output of this template as a unicode object.

    Same as :meth:`.render` but passes ``as_unicode=True`` to
    ``runtime._render``.
    """
    rendered = runtime._render(
        self, self.callable_, args, data, as_unicode=True)
    return rendered
+
def render_context(self, context, *args, **kwargs):
    """Render this :class:`.Template` with the given context.

    The data is written to the context's buffer.  If the context has
    no ``_with_template`` yet (missing or ``None``), this template is
    installed as its template first.

    """
    current = getattr(context, '_with_template', None)
    if current is None:
        context._set_with_template(self)
    runtime._render_context(
        self, self.callable_, context, *args, **kwargs)
+
def has_def(self, name):
    """Return ``True`` if the template module has an attribute named
    ``render_<name>``."""
    attr_name = "render_%s" % name
    return hasattr(self.module, attr_name)
+
def get_def(self, name):
    """Return a def of this template as a :class:`.DefTemplate`.

    The def is the module attribute ``render_<name>``; a missing
    attribute raises ``AttributeError``.
    """
    render_fn = getattr(self.module, "render_%s" % name)
    return DefTemplate(self, render_fn)
+
+ def _get_def_callable(self, name):
+ return getattr(self.module, "render_%s" % name)
+
@property
def last_modified(self):
    """The ``_modified_time`` value recorded in the template's
    module."""
    module = self.module
    return module._modified_time
+
class ModuleTemplate(Template):
    """A Template which is constructed given an existing Python module.

    e.g.::

        t = Template("this is a template")
        f = open("mymodule.py", "w")
        f.write(t.code)
        f.close()

        import mymodule

        t = ModuleTemplate(mymodule)
        print(t.render())

    """

    def __init__(self, module,
                 module_filename=None,
                 template=None,
                 template_filename=None,
                 module_source=None,
                 template_source=None,
                 output_encoding=None,
                 encoding_errors='strict',
                 disable_unicode=False,
                 bytestring_passthrough=False,
                 format_exceptions=False,
                 error_handler=None,
                 lookup=None,
                 cache_args=None,
                 cache_impl='beaker',
                 cache_enabled=True,
                 cache_type=None,
                 cache_dir=None,
                 cache_url=None,
                 ):
        """Wrap an already generated template ``module``.

        The uri, source encoding and ``enable_loop`` setting are read
        from the module itself; ``Template.__init__`` is not called.
        The ``template`` argument is accepted but not used.
        """
        template_uri = module._template_uri
        self.module_id = re.sub(r'\W', "_", template_uri)
        self.uri = module._template_uri
        self.input_encoding = module._source_encoding
        self.output_encoding = output_encoding
        self.encoding_errors = encoding_errors
        self.disable_unicode = disable_unicode
        self.bytestring_passthrough = bytestring_passthrough or disable_unicode
        self.enable_loop = module._enable_loop

        if disable_unicode:
            # rejected on Python 3, and incompatible with an output
            # encoding elsewhere
            if compat.py3k:
                raise exceptions.UnsupportedError(
                    "Mako for Python 3 does not "
                    "support disabling Unicode")
            if output_encoding:
                raise exceptions.UnsupportedError(
                    "output_encoding must be set to "
                    "None when disable_unicode is used.")

        self.module = module
        self.filename = template_filename
        # constructed for its registration side effect
        ModuleInfo(module,
                   module_filename,
                   self,
                   template_filename,
                   module_source,
                   template_source)

        self.callable_ = self.module.render_body
        self.format_exceptions = format_exceptions
        self.error_handler = error_handler
        self.lookup = lookup
        self._setup_cache_args(
            cache_impl, cache_enabled, cache_args,
            cache_type, cache_dir, cache_url
        )
+
class DefTemplate(Template):
    """A :class:`.Template` which represents a callable def in a parent
    template."""

    def __init__(self, parent, callable_):
        """Bind ``callable_`` as the render callable and copy the
        rendering settings from ``parent``."""
        self.parent = parent
        self.callable_ = callable_
        # settings shared with the parent template
        self.module = parent.module
        self.lookup = parent.lookup
        self.output_encoding = parent.output_encoding
        self.encoding_errors = parent.encoding_errors
        self.format_exceptions = parent.format_exceptions
        self.error_handler = parent.error_handler
        self.enable_loop = parent.enable_loop
        self.bytestring_passthrough = parent.bytestring_passthrough

    def get_def(self, name):
        """Return the def ``name`` of the parent template."""
        sibling = self.parent.get_def(name)
        return sibling
+
class ModuleInfo(object):
    """Stores information about a module currently loaded into
    memory, provides reverse lookups of template source, module
    source code based on a module's identifier.

    """
    # weak registry: entries disappear once the ModuleInfo is no
    # longer referenced elsewhere
    _modules = weakref.WeakValueDictionary()

    def __init__(self,
                 module,
                 module_filename,
                 template,
                 template_filename,
                 module_source,
                 template_source):
        """Record the module details and register this instance.

        The instance is registered under ``module.__name__`` and, when
        given, under ``module_filename``.  It is also stored on the
        template as ``_mmarker``.
        """
        self.module = module
        self.module_filename = module_filename
        self.template_filename = template_filename
        self.module_source = module_source
        self.template_source = template_source

        self._modules[module.__name__] = template._mmarker = self
        if module_filename:
            self._modules[module_filename] = self

    @classmethod
    def get_module_source_metadata(cls, module_source, full_line_map=False):
        """Parse the JSON metadata embedded in ``module_source``
        between ``__M_BEGIN_METADATA`` and ``__M_END_METADATA``.

        The ``line_map`` entry is converted to integer keys and values.
        With ``full_line_map`` a ``full_line_map`` list is added that
        holds one template line per module line in
        ``range(1, max(line_map))``, carrying the last mapped value
        forward across unmapped lines.
        """
        match = re.search(
            r"__M_BEGIN_METADATA(.+?)__M_END_METADATA",
            module_source, re.S)
        metadata = compat.json.loads(match.group(1))

        int_line_map = {}
        for raw_key, raw_value in metadata['line_map'].items():
            int_line_map[int(raw_key)] = int(raw_value)
        metadata['line_map'] = int_line_map

        if full_line_map:
            expanded = metadata['full_line_map'] = []
            current = 1
            for module_line in range(1, max(int_line_map)):
                if module_line in int_line_map:
                    current = int_line_map[module_line]
                expanded.append(current)
        return metadata

    @property
    def code(self):
        """The module source: the stored ``module_source`` when there
        is one, else the contents of ``module_filename``."""
        if self.module_source is None:
            return util.read_python_file(self.module_filename)
        return self.module_source

    @property
    def source(self):
        """The template source: the stored ``template_source`` when
        there is one, else the contents of ``template_filename``;
        decoded with the module's ``_source_encoding`` where needed."""
        encoding = self.module._source_encoding
        if self.template_source is None:
            raw = util.read_file(self.template_filename)
            if encoding:
                return raw.decode(encoding)
            return raw

        needs_decode = encoding and \
            not isinstance(self.template_source, compat.text_type)
        if needs_decode:
            return self.template_source.decode(encoding)
        return self.template_source
+
def _compile(template, text, filename, generate_magic_comment):
    """Lex ``text`` with the template's lexer class and generate the
    Python module source for it.

    Returns ``(source, lexer)``; the lexer is returned so callers can
    read the encoding it determined.
    """
    lexer = template.lexer_cls(
        text,
        filename,
        disable_unicode=template.disable_unicode,
        input_encoding=template.input_encoding,
        preprocessor=template.preprocessor)
    parse_tree = lexer.parse()

    codegen_options = dict(
        default_filters=template.default_filters,
        buffer_filters=template.buffer_filters,
        imports=template.imports,
        future_imports=template.future_imports,
        source_encoding=lexer.encoding,
        generate_magic_comment=generate_magic_comment,
        disable_unicode=template.disable_unicode,
        strict_undefined=template.strict_undefined,
        enable_loop=template.enable_loop,
        reserved_names=template.reserved_names)
    source = codegen.compile(parse_tree, template.uri, filename,
                             **codegen_options)
    return source, lexer
+
def _compile_text(template, text, filename):
    """Compile template ``text`` into an in-memory module.

    Returns ``(source, module)``: the generated Python source and a
    new module object, named after ``template.module_id``, in which
    that source has been executed.
    """
    module_name = template.module_id
    source, lexer = _compile(template, text, filename,
                             generate_magic_comment=template.disable_unicode)

    # on Python 2 a unicode identifier is encoded before use
    if not compat.py3k and isinstance(module_name, compat.text_type):
        module_name = module_name.encode()

    module = types.ModuleType(module_name)
    compiled = compile(source, module_name, 'exec')

    # this exec() works for 2.4->3.3.
    namespace = module.__dict__
    exec(compiled, namespace, namespace)
    return (source, module)
+
def _compile_module_file(template, text, filename, outputpath, module_writer):
    """Compile template ``text`` and write the generated module source
    to ``outputpath``.

    :param template: template whose settings drive compilation.
    :param text: template source text.
    :param filename: template filename passed on to the compiler.
    :param outputpath: destination path of the generated module.
    :param module_writer: optional callable ``(source, outputpath)``
     that takes over writing entirely.

    Without a ``module_writer`` the source is written to a temporary
    file in the destination directory and then moved into place.  The
    file descriptor is always closed, and the temporary file is
    removed again if writing or moving fails.
    """
    source, lexer = _compile(template, text, filename,
                             generate_magic_comment=True)

    if isinstance(source, compat.text_type):
        source = source.encode(lexer.encoding or 'ascii')

    if module_writer:
        module_writer(source, outputpath)
        return

    # make tempfiles in the same location as the ultimate
    # location. this ensures they're on the same filesystem,
    # avoiding synchronization issues.
    (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath))
    try:
        try:
            os.write(dest, source)
        finally:
            # never leak the descriptor, even when the write fails
            os.close(dest)
        shutil.move(name, outputpath)
    except:
        # best-effort cleanup of the stray temp file, then re-raise
        # the original error unchanged
        try:
            if os.path.exists(name):
                os.remove(name)
        except OSError:
            pass
        raise
+
def _get_module_info_from_callable(callable_):
    """Return the registered module info for the module in which
    ``callable_`` was defined, using the ``__name__`` found in the
    callable's globals."""
    # function globals live under a different attribute on Python 2
    globals_attr = '__globals__' if compat.py3k else 'func_globals'
    module_name = getattr(callable_, globals_attr)['__name__']
    return _get_module_info(module_name)
+
def _get_module_info(filename):
    """Return the ``ModuleInfo`` registered under ``filename``;
    raises ``KeyError`` when nothing is registered under that key."""
    registry = ModuleInfo._modules
    return registry[filename]
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py
new file mode 100644
index 00000000000..cba2ab7920c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py
@@ -0,0 +1,360 @@
+# mako/util.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+import re
+import collections
+import codecs
+import os
+from mako import compat
+import operator
+
+ def update_wrapper(decorated, fn):
+     """Mark ``decorated`` as a wrapper around ``fn``.
+
+     Stores ``fn`` as ``decorated.__wrapped__``, copies ``fn.__name__`` onto
+     ``decorated`` and returns ``decorated``.
+     """
+     setattr(decorated, '__wrapped__', fn)
+     setattr(decorated, '__name__', fn.__name__)
+     return decorated
+
+
+ class PluginLoader(object):
+     """Resolve named plugins for one entry-point ``group``.
+
+     Loader callables are kept in ``self.impls``, keyed by plugin name.  They
+     are added explicitly through :meth:`register` or discovered through
+     ``pkg_resources`` entry points on the first :meth:`load`.
+     """
+
+     def __init__(self, group):
+         self.group = group
+         self.impls = {}
+
+     def load(self, name):
+         """Return the object provided by the plugin ``name``.
+
+         A loader already present in ``self.impls`` is simply called.
+         Otherwise the first matching entry point of ``self.group`` is
+         remembered and loaded; if there is none, a mako
+         ``RuntimeException`` is raised.
+         """
+         try:
+             loader = self.impls[name]
+         except KeyError:
+             pass
+         else:
+             return loader()
+
+         import pkg_resources
+         entry_points = pkg_resources.iter_entry_points(self.group, name)
+         for impl in entry_points:
+             self.impls[name] = impl.load
+             return impl.load()
+
+         from mako import exceptions
+         raise exceptions.RuntimeException(
+             "Can't load plugin %s %s" %
+             (self.group, name))
+
+     def register(self, name, modulepath, objname):
+         """Register ``objname`` from the dotted ``modulepath`` as plugin ``name``.
+
+         The import is deferred until the plugin is first loaded.
+         """
+         def load():
+             # __import__ returns the top-level package, so walk down the
+             # remaining dotted components and finally the object itself
+             target = __import__(modulepath)
+             for attr in modulepath.split(".")[1:] + [objname]:
+                 target = getattr(target, attr)
+             return target
+         self.impls[name] = load
+
+ def verify_directory(dir):
+     """create and/or verify a filesystem directory.
+
+     ``os.makedirs`` is attempted (with mode 0775) for as long as ``dir``
+     does not exist.  An ``OSError`` from the first five attempts is ignored
+     and the attempt repeated; the ``OSError`` of the sixth attempt is
+     re-raised.  Any other exception propagates immediately.
+     """
+
+     tries = 0
+
+     while not os.path.exists(dir):
+         try:
+             tries += 1
+             os.makedirs(dir, compat.octal("0775"))
+         except OSError:
+             # only filesystem errors (e.g. another process creating the
+             # same path concurrently) are worth retrying; a bare except
+             # here would also swallow KeyboardInterrupt / SystemExit
+             if tries > 5:
+                 raise
+
+ def to_list(x, default=None):
+     """Coerce ``x`` to a sequence.
+
+     ``None`` yields ``default``; a list or tuple is returned unchanged;
+     anything else is wrapped in a one-element list.
+     """
+     if x is None:
+         return default
+     return x if isinstance(x, (list, tuple)) else [x]
+
+
+ class memoized_property(object):
+     """A read-only @property whose getter runs only once per instance.
+
+     The computed value is stored in the instance ``__dict__`` under the
+     getter's name, so subsequent attribute reads find it there.
+     """
+
+     def __init__(self, fget, doc=None):
+         self.fget = fget
+         self.__doc__ = doc or fget.__doc__
+         self.__name__ = fget.__name__
+
+     def __get__(self, obj, cls):
+         # accessed on the class itself: hand back the descriptor
+         if obj is None:
+             return self
+         value = self.fget(obj)
+         obj.__dict__[self.__name__] = value
+         return value
+
+ class memoized_instancemethod(object):
+     """Decorate a method so that its return value is memoized per instance.
+
+     Best applied to no-arg methods: memoization is not sensitive to
+     argument values, and will always return the same value even when
+     called with different arguments.
+
+     """
+
+     def __init__(self, fget, doc=None):
+         self.fget = fget
+         self.__doc__ = doc or fget.__doc__
+         self.__name__ = fget.__name__
+
+     def __get__(self, obj, cls):
+         if obj is None:
+             return self
+
+         def _label(func):
+             # make the stand-in look like the decorated method
+             func.__name__ = self.__name__
+             func.__doc__ = self.__doc__
+             return func
+
+         def oneshot(*args, **kw):
+             result = self.fget(obj, *args, **kw)
+             # from now on the instance carries a constant function under
+             # the method's name, which always returns the first result
+             obj.__dict__[self.__name__] = _label(lambda *a, **kw: result)
+             return result
+
+         return _label(oneshot)
+
+ class SetLikeDict(dict):
+     """a dictionary that has some setlike methods on it"""
+     def union(self, other):
+         """produce a 'union' of this dict and another (at the key level).
+
+         values in the second dict take precedence over that of the first.
+         Neither operand is modified; a new ``SetLikeDict`` is returned."""
+         # copy through the mapping constructor: ``SetLikeDict(**self)``
+         # raises TypeError as soon as a key is not a string
+         x = SetLikeDict(self)
+         x.update(other)
+         return x
+
+ class FastEncodingBuffer(object):
+     """a very rudimentary buffer that is faster than StringIO,
+     but doesn't crash on unicode data like cStringIO.
+
+     Written chunks are collected in a deque; ``write`` is bound directly
+     to that deque's ``append``.
+     """
+
+     def __init__(self, encoding=None, errors='strict', as_unicode=False):
+         self.data = collections.deque()
+         self.encoding = encoding
+         self.delim = compat.u('') if as_unicode else ''
+         self.as_unicode = as_unicode
+         self.errors = errors
+         self.write = self.data.append
+
+     def truncate(self):
+         """Discard everything written so far."""
+         fresh = collections.deque()
+         self.data = fresh
+         # ``write`` must follow the new deque, not the discarded one
+         self.write = fresh.append
+
+     def getvalue(self):
+         """Join the chunks, encoding the result when an encoding was given."""
+         joined = self.delim.join(self.data)
+         if not self.encoding:
+             return joined
+         return joined.encode(self.encoding, self.errors)
+
+ class LRUCache(dict):
+     """A dictionary-like object that stores a limited number of items,
+     discarding lesser used items periodically.
+
+     Every value is wrapped in an ``_Item`` carrying the time of its last
+     access.  Once the dict grows past ``capacity * (1 + threshold)`` entries
+     it is pruned back to the ``capacity`` most recently used ones.
+
+     this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based
+     paradigm so that synchronization is not really needed.  the size management
+     is inexact.
+     """
+
+     class _Item(object):
+         """Cache slot: key, value and timestamp of the last access."""
+
+         def __init__(self, key, value):
+             self.key = key
+             self.value = value
+             self.timestamp = compat.time_func()
+
+         def __repr__(self):
+             return repr(self.value)
+
+     def __init__(self, capacity, threshold=.5):
+         self.capacity = capacity
+         self.threshold = threshold
+
+     def __getitem__(self, key):
+         entry = dict.__getitem__(self, key)
+         # a read counts as a use
+         entry.timestamp = compat.time_func()
+         return entry.value
+
+     def values(self):
+         return [entry.value for entry in dict.values(self)]
+
+     def setdefault(self, key, value):
+         if key not in self:
+             self[key] = value
+             return value
+         return self[key]
+
+     def __setitem__(self, key, value):
+         entry = dict.get(self, key)
+         if entry is not None:
+             entry.value = value
+         else:
+             dict.__setitem__(self, key, self._Item(key, value))
+         self._manage_size()
+
+     def _manage_size(self):
+         while len(self) > self.capacity + self.capacity * self.threshold:
+             newest_first = sorted(
+                 dict.values(self),
+                 key=lambda entry: entry.timestamp,
+                 reverse=True)
+             for stale in newest_first[self.capacity:]:
+                 try:
+                     del self[stale.key]
+                 except KeyError:
+                     # if we couldn't find a key, most likely some other thread
+                     # broke in on us. loop around and try again
+                     break
+
+ # Regexp to match python magic encoding line
+ _PYTHON_MAGIC_COMMENT_re = re.compile(
+     r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)',
+     re.VERBOSE)
+
+ def parse_encoding(fp):
+     """Deduce the encoding of a Python source file (binary mode) from magic
+     comment.
+
+     It does this in the same way as the `Python interpreter`__
+
+     .. __: http://docs.python.org/ref/encodings.html
+
+     The ``fp`` argument should be a seekable file object in binary mode.
+     Its position is restored before returning.
+
+     Returns the encoding named by a magic comment on the first line, or on
+     the second line when the first is a complete statement on its own;
+     ``'utf_8'`` when the file starts with a UTF-8 byte-order-mark; ``None``
+     otherwise.  Raises ``SyntaxError`` when both a byte-order-mark and a
+     magic comment are present.
+     """
+     pos = fp.tell()
+     fp.seek(0)
+     try:
+         line1 = fp.readline()
+         has_bom = line1.startswith(codecs.BOM_UTF8)
+         if has_bom:
+             line1 = line1[len(codecs.BOM_UTF8):]
+
+         m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore'))
+         if not m:
+             try:
+                 # the builtin compile() is used for the syntax check because
+                 # the ``parser`` module no longer exists on current Python 3;
+                 # its ImportError used to be swallowed here, which silently
+                 # disabled the second-line scan
+                 compile(line1.decode('ascii', 'ignore'),
+                         '<magic-comment-check>', 'exec')
+             except SyntaxError:
+                 # Either it's a real syntax error, in which case the source
+                 # is not valid python source, or line2 is a continuation of
+                 # line1, in which case we don't want to scan line2 for a magic
+                 # comment.
+                 pass
+             else:
+                 line2 = fp.readline()
+                 m = _PYTHON_MAGIC_COMMENT_re.match(
+                     line2.decode('ascii', 'ignore'))
+
+         if has_bom:
+             if m:
+                 raise SyntaxError("python refuses to compile code with both a UTF8" \
+                                   " byte-order-mark and a magic encoding comment")
+             return 'utf_8'
+         elif m:
+             return m.group(1)
+         else:
+             return None
+     finally:
+         fp.seek(pos)
+
+ def sorted_dict_repr(d):
+     """repr() a dictionary with the keys in order.
+
+     Used by the lexer unit test to compare parse trees based on strings.
+
+     """
+     pairs = ["%r: %r" % (key, d[key]) for key in sorted(d.keys())]
+     return "{" + ", ".join(pairs) + "}"
+
+ def restore__ast(_ast):
+ """Attempt to restore the required classes to the _ast module if it
+ appears to be missing them
+ """
+ # nothing to restore when the module already exposes the AST base class
+ if hasattr(_ast, 'AST'):
+ return
+ # 2 << 9 == 1024; handed to compile() below as its flags argument
+ _ast.PyCF_ONLY_AST = 2 << 9
+ # Compile a snippet with one statement per node family.  The classes are
+ # then harvested from the resulting tree purely by position (m.body[i]),
+ # so the order of the lines inside the string must not change.
+ m = compile("""\
+def foo(): pass
+class Bar(object): pass
+if False: pass
+baz = 'mako'
+1 + 2 - 3 * 4 / 5
+6 // 7 % 8 << 9 >> 10
+11 & 12 ^ 13 | 14
+15 and 16 or 17
+-baz + (not +18) - ~17
+baz and 'foo' or 'bar'
+(mako is baz == baz) is not baz != mako
+mako > baz < mako >= baz <= mako
+mako in baz not in mako""", '<unknown>', 'exec', _ast.PyCF_ONLY_AST)
+ _ast.Module = type(m)
+
+ # the base classes are picked out of Module's MRO by class name
+ for cls in _ast.Module.__mro__:
+ if cls.__name__ == 'mod':
+ _ast.mod = cls
+ elif cls.__name__ == 'AST':
+ _ast.AST = cls
+
+ # body[0..2]: the def, class and if statements
+ _ast.FunctionDef = type(m.body[0])
+ _ast.ClassDef = type(m.body[1])
+ _ast.If = type(m.body[2])
+
+ # body[3]: baz = 'mako'
+ _ast.Name = type(m.body[3].targets[0])
+ _ast.Store = type(m.body[3].targets[0].ctx)
+ _ast.Str = type(m.body[3].value)
+
+ # body[4]: 1 + 2 - 3 * 4 / 5
+ _ast.Sub = type(m.body[4].value.op)
+ _ast.Add = type(m.body[4].value.left.op)
+ _ast.Div = type(m.body[4].value.right.op)
+ _ast.Mult = type(m.body[4].value.right.left.op)
+
+ # body[5]: 6 // 7 % 8 << 9 >> 10
+ _ast.RShift = type(m.body[5].value.op)
+ _ast.LShift = type(m.body[5].value.left.op)
+ _ast.Mod = type(m.body[5].value.left.left.op)
+ _ast.FloorDiv = type(m.body[5].value.left.left.left.op)
+
+ # body[6]: 11 & 12 ^ 13 | 14
+ _ast.BitOr = type(m.body[6].value.op)
+ _ast.BitXor = type(m.body[6].value.left.op)
+ _ast.BitAnd = type(m.body[6].value.left.left.op)
+
+ # body[7]: 15 and 16 or 17
+ _ast.Or = type(m.body[7].value.op)
+ _ast.And = type(m.body[7].value.values[0].op)
+
+ # body[8]: -baz + (not +18) - ~17
+ _ast.Invert = type(m.body[8].value.right.op)
+ _ast.Not = type(m.body[8].value.left.right.op)
+ _ast.UAdd = type(m.body[8].value.left.right.operand.op)
+ _ast.USub = type(m.body[8].value.left.left.op)
+
+ # body[9]: baz and 'foo' or 'bar' -- assigns Or / And a second time
+ # (they were already taken from body[7] above)
+ _ast.Or = type(m.body[9].value.op)
+ _ast.And = type(m.body[9].value.values[0].op)
+
+ # body[10]: (mako is baz == baz) is not baz != mako
+ _ast.IsNot = type(m.body[10].value.ops[0])
+ _ast.NotEq = type(m.body[10].value.ops[1])
+ _ast.Is = type(m.body[10].value.left.ops[0])
+ _ast.Eq = type(m.body[10].value.left.ops[1])
+
+ # body[11]: mako > baz < mako >= baz <= mako
+ _ast.Gt = type(m.body[11].value.ops[0])
+ _ast.Lt = type(m.body[11].value.ops[1])
+ _ast.GtE = type(m.body[11].value.ops[2])
+ _ast.LtE = type(m.body[11].value.ops[3])
+
+ # body[12]: mako in baz not in mako
+ _ast.In = type(m.body[12].value.ops[0])
+ _ast.NotIn = type(m.body[12].value.ops[1])
+
+
+
+ def read_file(path, mode='rb'):
+     """Return the entire contents of ``path``, opened with ``mode``.
+
+     The file is closed again whether or not reading succeeds.
+     """
+     with open(path, mode) as fp:
+         return fp.read()
+
+ def read_python_file(path):
+     """Read the Python source file at ``path``.
+
+     The file is opened in binary mode and handed to ``parse_encoding``.
+     When that reports an encoding the contents are decoded with it;
+     otherwise the raw bytes are returned.
+     """
+     with open(path, "rb") as fp:
+         encoding = parse_encoding(fp)
+         raw = fp.read()
+     return raw.decode(encoding) if encoding else raw
+
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
new file mode 100644
index 00000000000..922117e7e16
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -0,0 +1,141 @@
+<%
+ # Work out the column width used to align the generated KNOB_* #defines
+ # and the labels printed by ToString(): start from the longest knob name.
+ max_len = 0
+ for knob in knobs:
+ if len(knob[0]) > max_len: max_len = len(knob[0])
+ # add the length of the 'KNOB_' prefix plus one separating space
+ max_len += len('KNOB_ ')
+ # round up to the next multiple of 4
+ if max_len % 4: max_len += 4 - (max_len % 4)
+
+ def space_knob(knob):
+ # spaces needed to pad 'KNOB_' + knob out to max_len characters
+ knob_len = len('KNOB_' + knob)
+ return ' '*(max_len - knob_len)
+%>/******************************************************************************
+*
+* Copyright 2015
+* Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+% if gen_header:
+* @file ${filename}.h
+% else:
+* @file ${filename}.cpp
+% endif
+*
+* @brief Dynamic Knobs for Core.
+*
+* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
+*
+******************************************************************************/
+%if gen_header:
+#pragma once
+#include <string>
+
+ /// Holder for a single knob value of type T.
+ /// Only derived types can construct it (the constructor is protected);
+ /// the value is read and replaced through the two Value() overloads.
+ template <typename T>
+ struct Knob
+ {
+     /// Current value of the knob.
+     const T& Value() const
+     {
+         return m_Value;
+     }
+
+     /// Store newValue and return the value now held.
+     const T& Value(const T& newValue)
+     {
+         m_Value = newValue;
+         return m_Value;
+     }
+
+ protected:
+     Knob(const T& defaultValue)
+         : m_Value(defaultValue)
+     {
+     }
+
+ private:
+     T m_Value;
+ };
+
+ ## DEFINE_KNOB(_name, _type, _default) declares a struct Knob_<_name>
+ ## derived from Knob<_type>, default-constructed with _default and with a
+ ## static Name() returning "KNOB_<_name>", plus a member called _name of
+ ## that type.
+ ## NOTE(review): each doubled backslash followed by an empty line
+ ## presumably renders as one trailing backslash plus a newline (Mako drops
+ ## a backslash that directly precedes a newline) -- keep this layout and
+ ## confirm against the generated header.
+ #define DEFINE_KNOB(_name, _type, _default) \\
+
+ struct Knob_##_name : Knob<_type> \\
+
+ { \\
+
+ Knob_##_name() : Knob<_type>(_default) { } \\
+
+ static const char* Name() { return "KNOB_" #_name; } \\
+
+ } _name;
+
+ ## GET_KNOB reads, and SET_KNOB replaces, the value of the named member
+ ## of g_GlobalKnobs through the Value() overloads.
+ #define GET_KNOB(_name) g_GlobalKnobs._name.Value()
+ #define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue)
+
+ ## GlobalKnobs gets one DEFINE_KNOB member per entry of `knobs`.  Each
+ ## entry is indexed as knob[0] (used as the knob name) and knob[1] (a
+ ## mapping read for its 'desc' lines, 'type' and 'default').
+struct GlobalKnobs
+{
+ % for knob in knobs:
+ //-----------------------------------------------------------
+ // KNOB_${knob[0]}
+ //
+ % for line in knob[1]['desc']:
+ // ${line}
+ % endfor
+ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+
+ % endfor
+ GlobalKnobs();
+ std::string ToString(const char* optPerLinePrefix="");
+};
+extern GlobalKnobs g_GlobalKnobs;
+
+ ## One KNOB_<name> macro per knob, expanding to GET_KNOB(<name>);
+ ## space_knob() pads the names so the expansions line up in one column.
+ % for knob in knobs:
+ #define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]})
+ % endfor
+
+
+% else:
+ ## Emit one #include <...> per entry of `includes`.
+ % for inc in includes:
+ #include <${inc}>
+ % endfor
+
+//========================================================
+// Static Data Members
+//========================================================
+GlobalKnobs g_GlobalKnobs;
+
+//========================================================
+// Knob Initialization
+//========================================================
+ ## The constructor calls InitKnob(<name>) once per knob.  InitKnob is not
+ ## defined in this template.
+ ## NOTE(review): presumably it comes from one of the `includes` headers --
+ ## confirm.
+GlobalKnobs::GlobalKnobs()
+{
+ % for knob in knobs:
+ InitKnob(${knob[0]});
+ % endfor
+
+}
+
+//========================================================
+// Knob Display (Convert to String)
+//========================================================
+ // Formats every knob on its own line as
+ //     <optPerLinePrefix>KNOB_<name>:<padding><value>
+ // bool knobs print "+" or "-", float knobs print their value with one
+ // fixed decimal, all other knobs print the value in hex (left-aligned,
+ // 11 wide) followed by the same value in decimal.
+ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
+ {
+     std::basic_stringstream<char> str;
+     str << std::showbase << std::setprecision(1) << std::fixed;
+
+     // tolerate callers that pass a null prefix explicitly
+     if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; }
+
+     % for knob in knobs:
+     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
+     % if knob[1]['type'] == 'bool':
+     str << (KNOB_${knob[0]} ? "+\n" : "-\n");
+     % elif knob[1]['type'] != 'float':
+     str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
+     str << std::dec << KNOB_${knob[0]} << "\n";
+     % else:
+     str << KNOB_${knob[0]} << "\n";
+     % endif
+     % endfor
+
+     // Deliberately no std::ends: it would append a '\0' character that
+     // becomes part of the returned std::string (str() is already a
+     // complete string), so the NUL would show up in concatenations and
+     // stream output.
+     return str.str();
+ }
+
+
+% endif