author | Jason Ekstrand <[email protected]> | 2016-03-15 14:09:50 -0700
committer | Jason Ekstrand <[email protected]> | 2016-03-15 14:09:50 -0700
commit | 7f6a0cb29c89a03441be744680a2145445be3a3c (patch)
tree | 516824ab49962521563b95fa79430cf948baaccc
parent | b83785d86d2c7f07323920615c72a9f09695a9a7 (diff)
parent | e103b52aec773537d2821d8acc42ac9caa2a4b17 (diff)
Merge remote-tracking branch 'public/master' into vulkan
457 files changed, 62759 insertions, 2759 deletions
diff --git a/.gitignore b/.gitignore
index 21aa35cd36d..b4f88f7b7fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@ manifest.txt
 Makefile
 Makefile.in
 .install-mesa-links
+.install-gallium-links
diff --git a/configure.ac b/configure.ac
index 5f686f5602a..384de4dbde6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,7 +68,7 @@ OPENCL_VERSION=1
 AC_SUBST([OPENCL_VERSION])
 
 dnl Versions for external dependencies
-LIBDRM_REQUIRED=2.4.60
+LIBDRM_REQUIRED=2.4.66
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
@@ -1737,6 +1737,7 @@ AM_CONDITIONAL(HAVE_ST_XVMC, test "x$enable_xvmc" = xyes)
 if test "x$enable_vdpau" = xyes; then
     PKG_CHECK_MODULES([VDPAU], [vdpau >= $VDPAU_REQUIRED])
     gallium_st="$gallium_st vdpau"
+    DEFINES="$DEFINES -DHAVE_ST_VDPAU"
 fi
 AM_CONDITIONAL(HAVE_ST_VDPAU, test "x$enable_vdpau" = xyes)
@@ -2193,6 +2194,16 @@ radeon_llvm_check() {
     fi
 }
 
+swr_llvm_check() {
+    gallium_require_llvm $1
+    if test ${LLVM_VERSION_INT} -lt 306; then
+        AC_MSG_ERROR([LLVM version 3.6 or later required when building $1])
+    fi
+    if test "x$enable_gallium_llvm" != "xyes"; then
+        AC_MSG_ERROR([--enable-gallium-llvm is required when building $1])
+    fi
+}
+
 dnl Duplicates in GALLIUM_DRIVERS_DIRS are removed by sorting it after this block
 if test -n "$with_gallium_drivers"; then
     gallium_drivers=`IFS=', '; echo $with_gallium_drivers`
@@ -2265,6 +2276,30 @@ if test -n "$with_gallium_drivers"; then
             HAVE_GALLIUM_LLVMPIPE=yes
         fi
         ;;
+    xswr)
+        AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory])
+        swr_llvm_check "swr"
+
+        AC_MSG_CHECKING([whether $CXX supports AVX/AVX2])
+        AVX_CXXFLAGS="-march=core-avx-i"
+        AVX2_CXXFLAGS="-march=core-avx2"
+
+        AC_LANG_PUSH([C++])
+        save_CXXFLAGS="$CXXFLAGS"
+        CXXFLAGS="$AVX_CXXFLAGS $CXXFLAGS"
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+                          [AC_MSG_ERROR([AVX compiler support not detected])])
+        CXXFLAGS="$save_CXXFLAGS"
+
+        save_CFLAGS="$CXXFLAGS"
+        CXXFLAGS="$AVX2_CXXFLAGS $CXXFLAGS"
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+                          [AC_MSG_ERROR([AVX2 compiler support not detected])])
+        CXXFLAGS="$save_CXXFLAGS"
+        AC_LANG_POP([C++])
+
+        HAVE_GALLIUM_SWR=yes
+        ;;
     xvc4)
         HAVE_GALLIUM_VC4=yes
         gallium_require_drm "vc4"
@@ -2354,6 +2389,7 @@ AM_CONDITIONAL(HAVE_GALLIUM_NOUVEAU, test "x$HAVE_GALLIUM_NOUVEAU" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
@@ -2470,6 +2506,9 @@ AC_CONFIG_FILES([Makefile
                 src/gallium/drivers/rbug/Makefile
                 src/gallium/drivers/softpipe/Makefile
                 src/gallium/drivers/svga/Makefile
+                src/gallium/drivers/swr/Makefile
+                src/gallium/drivers/swr/avx/Makefile
+                src/gallium/drivers/swr/avx2/Makefile
                 src/gallium/drivers/trace/Makefile
                 src/gallium/drivers/vc4/Makefile
                 src/gallium/drivers/virgl/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index d141c221f19..ee7facafc95 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -158,7 +158,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_explicit_uniform_location             DONE (all drivers that support GLSL)
   GL_ARB_fragment_layer_viewport               DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments            DONE (i965)
-  GL_ARB_internalformat_query2                 in progress (elima)
+  GL_ARB_internalformat_query2                 DONE (i965)
   GL_ARB_invalidate_subdata                    DONE (all drivers)
   GL_ARB_multi_draw_indirect                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query               DONE (all drivers)
@@ -180,8 +180,8 @@ GL 4.4, GLSL 4.40:
   GL_ARB_clear_texture                         DONE (i965, nv50, nvc0)
   GL_ARB_enhanced_layouts                      in progress (Timothy)
   - compile-time constant expressions          DONE
-  - explicit byte offsets for blocks           in progress
-  - forced alignment within blocks             in progress
+  - explicit byte offsets for blocks           DONE
+  - forced alignment within blocks             DONE
   - specified vec4-slot component numbers      in progress
   - specified transform/feedback layout        in progress
   - input/output block locations               DONE
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index fa650830e23..c31296ef9b1 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -44,6 +44,8 @@ Note: some of the new features are only available with certain drivers.
 </p>
 <ul>
+<li>GL_ARB_internalformat_query2 on i965</li>
+<li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
diff --git a/include/EGL/eglmesaext.h b/include/EGL/eglmesaext.h
index 917a2043c77..337dd2cb789 100644
--- a/include/EGL/eglmesaext.h
+++ b/include/EGL/eglmesaext.h
@@ -34,17 +34,6 @@ extern "C" {
 #include <EGL/eglplatform.h>
 
-#ifndef EGL_MESA_drm_display
-#define EGL_MESA_drm_display 1
-
-#ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLDisplay EGLAPIENTRY eglGetDRMDisplayMESA(int fd);
-#endif /* EGL_EGLEXT_PROTOTYPES */
-
-typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDRMDISPLAYMESA) (int fd);
-
-#endif /* EGL_MESA_drm_display */
-
 #ifdef EGL_MESA_drm_image
 /* Mesa's extension to EGL_MESA_drm_image... */
 #ifndef EGL_DRM_BUFFER_USE_CURSOR_MESA
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index 6bbd3fa87f5..2b49a2941e1 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1100,6 +1100,11 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_USE_SCANOUT	0x0002
 #define __DRI_IMAGE_USE_CURSOR	0x0004 /* Deprecated */
 #define __DRI_IMAGE_USE_LINEAR	0x0008
+/* The buffer will only be read by an external process after SwapBuffers,
+ * in contrast to GBM buffers, front buffers and fake front buffers, which
+ * could be read after a flush.
+ */
+#define __DRI_IMAGE_USE_BACKBUFFER 0x0010
 
 /**
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index bdfbefe0b75..bd645fae640 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -156,8 +156,8 @@ CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
-CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherryview)")
-CHIPSET(0x22B1, chv, "Intel(R) HD Graphics (Cherryview)")
+CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherrytrail)")
+CHIPSET(0x22B1, chv, "Intel(R) HD Graphics XXX (Braswell)") /* Overridden in brw_get_renderer_string */
 CHIPSET(0x22B2, chv, "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B3, chv, "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x0A84, bxt, "Intel(R) HD Graphics (Broxton)")
diff --git a/m4/ax_cxx_compile_stdcxx.m4 b/m4/ax_cxx_compile_stdcxx.m4
new file mode 100644
index 00000000000..079e17d2a62
--- /dev/null
+++ b/m4/ax_cxx_compile_stdcxx.m4
@@ -0,0 +1,558 @@
+# ===========================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
+#
+# DESCRIPTION
+#
+#   Check for baseline language coverage in the compiler for the specified
+#   version of the C++ standard.  If necessary, add switches to CXXFLAGS to
+#   enable support.  VERSION may be '11' (for the C++11 standard) or '14'
+#   (for the C++14 standard).
+#
+#   The second argument, if specified, indicates whether you insist on an
+#   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
+#   -std=c++11).  If neither is specified, you get whatever works, with
+#   preference for an extended mode.
+#
+#   The third argument, if specified 'mandatory' or if left unspecified,
+#   indicates that baseline support for the specified C++ standard is
+#   required and that the macro should error out if no mode with that
+#   support is found.  If specified 'optional', then configuration proceeds
+#   regardless, after defining HAVE_CXX${VERSION} if and only if a
+#   supporting mode is found.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Benjamin Kosnik <[email protected]>
+#   Copyright (c) 2012 Zack Weinberg <[email protected]>
+#   Copyright (c) 2013 Roy Stogner <[email protected]>
+#   Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <[email protected]>
+#   Copyright (c) 2015 Paul Norman <[email protected]>
+#   Copyright (c) 2015 Moritz Klammler <[email protected]>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 1
+
+dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
+dnl  (serial version number 13).
+
+AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
+  m4_if([$1], [11], [],
+        [$1], [14], [],
+        [$1], [17], [m4_fatal([support for C++17 not yet implemented in AX_CXX_COMPILE_STDCXX])],
+        [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$2], [], [],
+        [$2], [ext], [],
+        [$2], [noext], [],
+        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [optional], [ax_cxx_compile_cxx$1_required=false],
+        [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
+  AC_LANG_PUSH([C++])dnl
+  ac_success=no
+  AC_CACHE_CHECK(whether $CXX supports C++$1 features by default,
+  ax_cv_cxx_compile_cxx$1,
+  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+    [ax_cv_cxx_compile_cxx$1=yes],
+    [ax_cv_cxx_compile_cxx$1=no])])
+  if test x$ax_cv_cxx_compile_cxx$1 = xyes; then
+    ac_success=yes
+  fi
+
+  m4_if([$2], [noext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=gnu++$1 -std=gnu++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+
+  m4_if([$2], [ext], [], [dnl
+  if test x$ac_success = xno; then
+    dnl HP's aCC needs +std=c++11 according to:
+    dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
+    dnl Cray's crayCC needs "-h std=c++11"
+    for switch in -std=c++$1 -std=c++0x +std=c++$1 "-h std=c++$1"; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+  AC_LANG_POP([C++])
+  if test x$ax_cxx_compile_cxx$1_required = xtrue; then
+    if test x$ac_success = xno; then
+      AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
+    fi
+  else
+    if test x$ac_success = xno; then
+      HAVE_CXX$1=0
+      AC_MSG_NOTICE([No compiler with C++$1 support was found])
+    else
+      HAVE_CXX$1=1
+      AC_DEFINE(HAVE_CXX$1,1,
+                [define if the compiler supports basic C++$1 syntax])
+    fi
+
+    AC_SUBST(HAVE_CXX$1)
+  fi
+])
+
+
+dnl  Test body for checking C++11 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+)
+
+
+dnl  Test body for checking C++14 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
+)
+
+
+dnl  Tests for new features in C++11
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+  namespace test_static_assert
+  {
+
+    template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+  }
+
+  namespace test_final_override
+  {
+
+    struct Base
+    {
+      virtual void f() {}
+    };
+
+    struct Derived : public Base
+    {
+      virtual void f() override {}
+    };
+
+  }
+
+  namespace test_double_right_angle_brackets
+  {
+
+    template < typename T >
+    struct check {};
+
+    typedef check<void> single_type;
+    typedef check<check<void>> double_type;
+    typedef check<check<check<void>>> triple_type;
+    typedef check<check<check<check<void>>>> quadruple_type;
+
+  }
+
+  namespace test_decltype
+  {
+
+    int
+    f()
+    {
+      int a = 1;
+      decltype(a) b = 2;
+      return a + b;
+    }
+
+  }
+
+  namespace test_type_deduction
+  {
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static const bool value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static const bool value = true;
+    };
+
+    template < typename T1, typename T2 >
+    auto
+    add(T1 a1, T2 a2) -> decltype(a1 + a2)
+    {
+      return a1 + a2;
+    }
+
+    int
+    test(const int c, volatile int v)
+    {
+      static_assert(is_same<int, decltype(0)>::value == true, "");
+      static_assert(is_same<int, decltype(c)>::value == false, "");
+      static_assert(is_same<int, decltype(v)>::value == false, "");
+      auto ac = c;
+      auto av = v;
+      auto sumi = ac + av + 'x';
+      auto sumf = ac + av + 1.0;
+      static_assert(is_same<int, decltype(ac)>::value == true, "");
+      static_assert(is_same<int, decltype(av)>::value == true, "");
+      static_assert(is_same<int, decltype(sumi)>::value == true, "");
+      static_assert(is_same<int, decltype(sumf)>::value == false, "");
+      static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+      return (sumf > 0.0) ? sumi : add(c, v);
+    }
+
+  }
+
+  namespace test_noexcept
+  {
+
+    int f() { return 0; }
+    int g() noexcept { return 0; }
+
+    static_assert(noexcept(f()) == false, "");
+    static_assert(noexcept(g()) == true, "");
+
+  }
+
+  namespace test_constexpr
+  {
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+    {
+      return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+    }
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c(const CharT *const s) noexcept
+    {
+      return strlen_c_r(s, 0UL);
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("1") == 1UL, "");
+    static_assert(strlen_c("example") == 7UL, "");
+    static_assert(strlen_c("another\0example") == 7UL, "");
+
+  }
+
+  namespace test_rvalue_references
+  {
+
+    template < int N >
+    struct answer
+    {
+      static constexpr int value = N;
+    };
+
+    answer<1> f(int&)       { return answer<1>(); }
+    answer<2> f(const int&) { return answer<2>(); }
+    answer<3> f(int&&)      { return answer<3>(); }
+
+    void
+    test()
+    {
+      int i = 0;
+      const int c = 0;
+      static_assert(decltype(f(i))::value == 1, "");
+      static_assert(decltype(f(c))::value == 2, "");
+      static_assert(decltype(f(0))::value == 3, "");
+    }
+
+  }
+
+  namespace test_uniform_initialization
+  {
+
+    struct test
+    {
+      static const int zero {};
+      static const int one {1};
+    };
+
+    static_assert(test::zero == 0, "");
+    static_assert(test::one == 1, "");
+
+  }
+
+  namespace test_lambdas
+  {
+
+    void
+    test1()
+    {
+      auto lambda1 = [](){};
+      auto lambda2 = lambda1;
+      lambda1();
+      lambda2();
+    }
+
+    int
+    test2()
+    {
+      auto a = [](int i, int j){ return i + j; }(1, 2);
+      auto b = []() -> int { return '0'; }();
+      auto c = [=](){ return a + b; }();
+      auto d = [&](){ return c; }();
+      auto e = [a, &b](int x) mutable {
+        const auto identity = [](int y){ return y; };
+        for (auto i = 0; i < a; ++i)
+          a += b--;
+        return x + identity(a + b);
+      }(0);
+      return a + b + c + d + e;
+    }
+
+    int
+    test3()
+    {
+      const auto nullary = [](){ return 0; };
+      const auto unary = [](int x){ return x; };
+      using nullary_t = decltype(nullary);
+      using unary_t = decltype(unary);
+      const auto higher1st = [](nullary_t f){ return f(); };
+      const auto higher2nd = [unary](nullary_t f1){
+        return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+      };
+      return higher1st(nullary) + higher2nd(nullary)(unary);
+    }
+
+  }
+
+  namespace test_variadic_templates
+  {
+
+    template <int...>
+    struct sum;
+
+    template <int N0, int... N1toN>
+    struct sum<N0, N1toN...>
+    {
+      static constexpr auto value = N0 + sum<N1toN...>::value;
+    };
+
+    template <>
+    struct sum<>
+    {
+      static constexpr auto value = 0;
+    };
+
+    static_assert(sum<>::value == 0, "");
+    static_assert(sum<1>::value == 1, "");
+    static_assert(sum<23>::value == 23, "");
+    static_assert(sum<1, 2>::value == 3, "");
+    static_assert(sum<5, 5, 11>::value == 21, "");
+    static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+  }
+
+  // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+  // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+  // because of this.
+  namespace test_template_alias_sfinae
+  {
+
+    struct foo {};
+
+    template<typename T>
+    using member = typename T::member_type;
+
+    template<typename T>
+    void func(...) {}
+
+    template<typename T>
+    void func(member<T>*) {}
+
+    void test();
+
+    void test() { func<foo>(0); }
+
+  }
+
+}  // namespace cxx11
+
+#endif  // __cplusplus >= 201103L
+
+]])
+
+
+dnl  Tests for new features in C++14
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
+
+// If the compiler admits that it is not ready for C++14, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201402L
+
+#error "This is not a C++14 compiler"
+
+#else
+
+namespace cxx14
+{
+
+  namespace test_polymorphic_lambdas
+  {
+
+    int
+    test()
+    {
+      const auto lambda = [](auto&&... args){
+        const auto istiny = [](auto x){
+          return (sizeof(x) == 1UL) ? 1 : 0;
+        };
+        const int aretiny[] = { istiny(args)... };
+        return aretiny[0];
+      };
+      return lambda(1, 1L, 1.0f, '1');
+    }
+
+  }
+
+  namespace test_binary_literals
+  {
+
+    constexpr auto ivii = 0b0000000000101010;
+    static_assert(ivii == 42, "wrong value");
+
+  }
+
+  namespace test_generalized_constexpr
+  {
+
+    template < typename CharT >
+    constexpr unsigned long
+    strlen_c(const CharT *const s) noexcept
+    {
+      auto length = 0UL;
+      for (auto p = s; *p; ++p)
+        ++length;
+      return length;
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("x") == 1UL, "");
+    static_assert(strlen_c("test") == 4UL, "");
+    static_assert(strlen_c("another\0test") == 7UL, "");
+
+  }
+
+  namespace test_lambda_init_capture
+  {
+
+    int
+    test()
+    {
+      auto x = 0;
+      const auto lambda1 = [a = x](int b){ return a + b; };
+      const auto lambda2 = [a = lambda1(x)](){ return a; };
+      return lambda2();
+    }
+
+  }
+
+  namespace test_digit_seperators
+  {
+
+    constexpr auto ten_million = 100'000'000;
+    static_assert(ten_million == 100000000, "");
+
+  }
+
+  namespace test_return_type_deduction
+  {
+
+    auto f(int& x) { return x; }
+    decltype(auto) g(int& x) { return x; }
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static constexpr auto value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static constexpr auto value = true;
+    };
+
+    int
+    test()
+    {
+      auto x = 0;
+      static_assert(is_same<int, decltype(f(x))>::value, "");
+      static_assert(is_same<int&, decltype(g(x))>::value, "");
+      return x;
+    }
+
+  }
+
+}  // namespace cxx14
+
+#endif  // __cplusplus >= 201402L
+
+]])
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 2a4568aa679..b0b8281869d 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -188,10 +188,10 @@ NIR_FILES = \
 	nir/nir_lower_clip.c \
 	nir/nir_lower_global_vars_to_local.c \
 	nir/nir_lower_gs_intrinsics.c \
-	nir/nir_lower_indirect_derefs.c \
 	nir/nir_lower_load_const_to_scalar.c \
 	nir/nir_lower_locals_to_regs.c \
 	nir/nir_lower_idiv.c \
+	nir/nir_lower_indirect_derefs.c \
 	nir/nir_lower_io.c \
 	nir/nir_lower_outputs_to_temporaries.c \
 	nir/nir_lower_phis_to_scalar.c \
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 9aa5bb99f49..727aa432631 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -479,6 +479,12 @@ struct ast_type_qualifier {
       unsigned pixel_center_integer:1;
       /*@}*/
 
+      /**
+       * Flag set if GL_ARB_enhanced_layouts "align" layout qualifier is
+       * used.
+       */
+      unsigned explicit_align:1;
+
       /**
        * Flag set if GL_ARB_explicit_attrib_location "location" layout
        * qualifier is used.
@@ -577,6 +583,11 @@ struct ast_type_qualifier {
    /** Precision of the type (highp/medium/lowp). */
    unsigned precision:2;
 
+   /**
+    * Alignment specified via GL_ARB_enhanced_layouts "align" layout qualifier
+    */
+   ast_expression *align;
+
    /** Geometry shader invocations for GL_ARB_gpu_shader5. */
    ast_layout_expression *invocations;
@@ -1061,10 +1072,9 @@ public:
 class ast_interface_block : public ast_node {
 public:
-   ast_interface_block(ast_type_qualifier layout,
-                       const char *instance_name,
+   ast_interface_block(const char *instance_name,
                        ast_array_specifier *array_specifier)
-   : layout(layout), block_name(NULL), instance_name(instance_name),
+   : block_name(NULL), instance_name(instance_name),
      array_specifier(array_specifier)
    {
    }
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index db5ec9a4ad9..5262bd87655 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -4223,6 +4223,18 @@ ast_declarator_list::hir(exec_list *instructions,
                           type_name);
       } else {
          if (decl_type->base_type == GLSL_TYPE_ARRAY) {
+            /* From Section 13.22 (Array Declarations) of the GLSL ES 3.2
+             * spec:
+             *
+             *    "... any declaration that leaves the size undefined is
+             *    disallowed as this would add complexity and there are no
+             *    use-cases."
+             */
+            if (state->es_shader && decl_type->is_unsized_array()) {
+               _mesa_glsl_error(&loc, state, "array size must be explicitly "
+                                "or implicitly defined");
+            }
+
             /* From Section 4.12 (Empty Declarations) of the GLSL 4.5 spec:
              *
              *    "The combinations of types and qualifiers that cause
@@ -6244,9 +6256,11 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
                                           unsigned block_stream,
-                                          unsigned expl_location)
+                                          unsigned expl_location,
+                                          unsigned expl_align)
 {
    unsigned decl_count = 0;
+   unsigned next_offset = 0;
 
    /* Make an initial pass over the list of fields to determine how
    * many there are.  Each element in this list is an ast_declarator_list.
@@ -6460,13 +6474,93 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          }
       }
 
+      /* Offset can only be used with std430 and std140 layouts; an initial
+       * value of 0 is used for error detection.
+       */
+      unsigned align = 0;
+      unsigned size = 0;
+      if (layout) {
+         bool row_major;
+         if (qual->flags.q.row_major ||
+             matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR) {
+            row_major = true;
+         } else {
+            row_major = false;
+         }
+
+         if (layout->flags.q.std140) {
+            align = field_type->std140_base_alignment(row_major);
+            size = field_type->std140_size(row_major);
+         } else if (layout->flags.q.std430) {
+            align = field_type->std430_base_alignment(row_major);
+            size = field_type->std430_size(row_major);
+         }
+      }
+
+      if (qual->flags.q.explicit_offset) {
+         unsigned qual_offset;
+         if (process_qualifier_constant(state, &loc, "offset",
+                                        qual->offset, &qual_offset)) {
+            if (align != 0 && size != 0) {
+               if (next_offset > qual_offset)
+                  _mesa_glsl_error(&loc, state, "layout qualifier "
+                                   "offset overlaps previous member");
+
+               if (qual_offset % align) {
+                  _mesa_glsl_error(&loc, state, "layout qualifier offset "
+                                   "must be a multiple of the base "
+                                   "alignment of %s", field_type->name);
+               }
+               fields[i].offset = qual_offset;
+               next_offset = glsl_align(qual_offset + size, align);
+            } else {
+               _mesa_glsl_error(&loc, state, "offset can only be used "
+                                "with std430 and std140 layouts");
+            }
+         }
+      } else {
+         fields[i].offset = -1;
+      }
+
+      if (qual->flags.q.explicit_align || expl_align != 0) {
+         unsigned offset = fields[i].offset != -1 ? fields[i].offset :
+            next_offset;
+         if (align == 0 || size == 0) {
+            _mesa_glsl_error(&loc, state, "align can only be used with "
+                             "std430 and std140 layouts");
+         } else if (qual->flags.q.explicit_align) {
+            unsigned member_align;
+            if (process_qualifier_constant(state, &loc, "align",
+                                           qual->align, &member_align)) {
+               if (member_align == 0 ||
+                   member_align & (member_align - 1)) {
+                  _mesa_glsl_error(&loc, state, "align layout qualifier "
+                                   "is not a power of 2");
+               } else {
+                  fields[i].offset = glsl_align(offset, member_align);
+                  next_offset = glsl_align(fields[i].offset + size, align);
+               }
+            }
+         } else {
+            fields[i].offset = glsl_align(offset, expl_align);
+            next_offset = glsl_align(fields[i].offset + size, align);
+         }
+      }
+
+      if (!qual->flags.q.explicit_offset) {
+         if (align != 0 && size != 0)
+            next_offset = glsl_align(next_offset + size, align);
+      }
+
       /* Propagate row- / column-major information down the fields of the
        * structure or interface block.  Structures need this data because
        * the structure may contain a structure that contains ... a matrix
        * that need the proper layout.
        */
-      if (field_type->without_array()->is_matrix()
-          || field_type->without_array()->is_record()) {
+      if (is_interface &&
+          (layout->flags.q.uniform || layout->flags.q.buffer) &&
+          (field_type->without_array()->is_matrix()
+           || field_type->without_array()->is_record())) {
          /* If no layout is specified for the field, inherit the layout
          * from the block.
          */
@@ -6477,11 +6571,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
         else if (qual->flags.q.column_major)
            fields[i].matrix_layout = GLSL_MATRIX_LAYOUT_COLUMN_MAJOR;
 
-         /* If we're processing an interface block, the matrix layout must
-          * be decided by this point.
+         /* If we're processing a uniform or buffer block, the matrix
+          * layout must be decided by this point.
          */
-         assert(!is_interface
-                || fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR
+         assert(fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR
                 || fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_COLUMN_MAJOR);
       }
@@ -6553,7 +6646,8 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 ir_var_auto,
                                                 layout,
                                                 0, /* for interface only */
-                                                expl_location);
+                                                expl_location,
+                                                0 /* for interface only */);
 
    validate_identifier(this->name, loc, state);
@@ -6721,6 +6815,20 @@ ast_interface_block::hir(exec_list *instructions,
       }
    }
 
+   unsigned expl_align = 0;
+   if (layout.flags.q.explicit_align) {
+      if (!process_qualifier_constant(state, &loc, "align",
+                                      layout.align, &expl_align)) {
+         return NULL;
+      } else {
+         if (expl_align == 0 || expl_align & (expl_align - 1)) {
+            _mesa_glsl_error(&loc, state, "align layout qualifier is not a "
+                             "power of 2.");
+            return NULL;
+         }
+      }
+   }
+
    unsigned int num_variables =
       ast_process_struct_or_iface_block_members(&declared_variables,
                                                 state,
@@ -6732,7 +6840,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                 var_mode,
                                                 &this->layout,
                                                 qual_stream,
-                                                expl_location);
+                                                expl_location,
+                                                expl_align);
 
    if (!redeclaring_per_vertex) {
       validate_identifier(this->block_name, loc, state);
@@ -6833,6 +6942,8 @@ ast_interface_block::hir(exec_list *instructions,
          } else {
             fields[i].location =
                earlier_per_vertex->fields.structure[j].location;
+            fields[i].offset =
+               earlier_per_vertex->fields.structure[j].offset;
             fields[i].interpolation =
                earlier_per_vertex->fields.structure[j].interpolation;
             fields[i].centroid =
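The ast_to_hir.cpp hunks above implement the GL_ARB_enhanced_layouts "offset" and "align" member qualifiers: each member's std140/std430 size and base alignment feed a running next_offset, an explicit offset must not overlap the previous member and must be a multiple of the member's base alignment, and "align" rounds the member up to a power-of-two boundary. A minimal GLSL sketch of the accepted and rejected forms (the block and member names are illustrative, not taken from the patch):

#version 440

layout(std140) uniform Q {
    vec4 a;                      // bytes 0..15; next_offset becomes 16
    layout(offset = 32) vec3 b;  // ok: 32 is a multiple of b's 16-byte base alignment
    layout(align = 32) float c;  // placed at byte 64, the next multiple of 32 after b
    layout(offset = 8) float d;  // error: "layout qualifier offset overlaps previous member"
};

With a packed or shared layout, the same declarations instead draw "offset can only be used with std430 and std140 layouts", since align and size stay 0 there.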
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index dcd83efa6ff..07ed4f2356c 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -73,6 +73,7 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.column_major
           || this->flags.q.row_major
           || this->flags.q.packed
+          || this->flags.q.explicit_align
           || this->flags.q.explicit_location
           || this->flags.q.explicit_image_format
           || this->flags.q.explicit_index
@@ -134,6 +135,28 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    stream_layout_mask.flags.i = 0;
    stream_layout_mask.flags.q.stream = 1;
 
+   /* FIXME: We should probably do interface and function param validation
+    * separately.
+    */
+   ast_type_qualifier input_layout_mask;
+   input_layout_mask.flags.i = 0;
+   input_layout_mask.flags.q.centroid = 1;
+   /* Function params can have constant */
+   input_layout_mask.flags.q.constant = 1;
+   input_layout_mask.flags.q.explicit_location = 1;
+   input_layout_mask.flags.q.flat = 1;
+   input_layout_mask.flags.q.in = 1;
+   input_layout_mask.flags.q.invariant = 1;
+   input_layout_mask.flags.q.noperspective = 1;
+   input_layout_mask.flags.q.origin_upper_left = 1;
+   /* Function params 'inout' will set this */
+   input_layout_mask.flags.q.out = 1;
+   input_layout_mask.flags.q.patch = 1;
+   input_layout_mask.flags.q.pixel_center_integer = 1;
+   input_layout_mask.flags.q.precise = 1;
+   input_layout_mask.flags.q.sample = 1;
+   input_layout_mask.flags.q.smooth = 1;
+
    /* Uniform block layout qualifiers get to overwrite each
    * other (rightmost having priority), while all other
    * qualifiers currently don't allow duplicates.
@@ -258,6 +281,16 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
 
    this->flags.i |= q.flags.i;
 
+   if (this->flags.q.in &&
+       (this->flags.i & ~input_layout_mask.flags.i) != 0) {
+      _mesa_glsl_error(loc, state,
+                       "invalid input layout qualifier used");
+      return false;
+   }
+
+   if (q.flags.q.explicit_align)
+      this->align = q.align;
+
    if (q.flags.q.explicit_location)
       this->location = q.location;
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index 0a0dcc68a05..ff6b628eb64 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -432,6 +432,12 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+shader_atomic_counter_ops(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_atomic_counter_ops_enable;
+}
+
+static bool
 shader_clock(const _mesa_glsl_parse_state *state)
 {
    return state->ARB_shader_clock_enable;
@@ -578,7 +584,7 @@ private:
    ir_dereference_array *array_ref(ir_variable *var, int i);
    ir_swizzle *matrix_elt(ir_variable *var, int col, int row);
 
-   ir_expression *asin_expr(ir_variable *x);
+   ir_expression *asin_expr(ir_variable *x, float p0, float p1);
    void do_atan(ir_factory &body, const glsl_type *type, ir_variable *res, operand y_over_x);
 
    /**
@@ -792,8 +798,14 @@ private:
    B1(interpolateAtSample)
 
    ir_function_signature *_atomic_counter_intrinsic(builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_intrinsic1(builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_intrinsic2(builtin_available_predicate avail);
    ir_function_signature *_atomic_counter_op(const char *intrinsic,
                                              builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_op1(const char *intrinsic,
+                                              builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_op2(const char *intrinsic,
+                                              builtin_available_predicate avail);
 
    ir_function_signature *_atomic_intrinsic2(builtin_available_predicate avail,
                                              const glsl_type *type);
@@ -968,48 +980,59 @@ builtin_builder::create_intrinsics()
                             glsl_type::uint_type),
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
+         NULL);
+   add_function("__intrinsic_atomic_sub",
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
          NULL);
    add_function("__intrinsic_atomic_min",
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::uint_type),
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
          NULL);
    add_function("__intrinsic_atomic_max",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_and",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_or",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_xor",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_exchange",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_comp_swap",
         _atomic_intrinsic3(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic3(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic2(shader_atomic_counter_ops),
         NULL);
 
    add_image_functions(false);
@@ -2714,6 +2737,43 @@ builtin_builder::create_builtins()
                                 shader_atomic_counters),
                 NULL);
 
+   add_function("atomicCounterAddARB",
+                _atomic_counter_op1("__intrinsic_atomic_add",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterSubtractARB",
+                _atomic_counter_op1("__intrinsic_atomic_sub",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterMinARB",
+                _atomic_counter_op1("__intrinsic_atomic_min",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterMaxARB",
+                _atomic_counter_op1("__intrinsic_atomic_max",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterAndARB",
+                _atomic_counter_op1("__intrinsic_atomic_and",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterOrARB",
+                _atomic_counter_op1("__intrinsic_atomic_or",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterXorARB",
+                _atomic_counter_op1("__intrinsic_atomic_xor",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterExchangeARB",
+                _atomic_counter_op1("__intrinsic_atomic_exchange",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterCompSwapARB",
+                _atomic_counter_op2("__intrinsic_atomic_comp_swap",
+                                    shader_atomic_counter_ops),
+                NULL);
+
    add_function("atomicAdd",
                 _atomic_op2("__intrinsic_atomic_add",
                             buffer_atomics_supported,
@@ -3212,7 +3272,7 @@ builtin_builder::_tan(const glsl_type *type)
 }
 
 ir_expression *
-builtin_builder::asin_expr(ir_variable *x)
+builtin_builder::asin_expr(ir_variable *x, float p0, float p1)
 {
    return mul(sign(x),
               sub(imm(M_PI_2f),
@@ -3221,8 +3281,8 @@ builtin_builder::asin_expr(ir_variable *x)
                           mul(abs(x),
                               add(imm(M_PI_4f - 1.0f),
                                   mul(abs(x),
-                                      add(imm(0.086566724f),
-                                          mul(abs(x), imm(-0.03102955f))))))))));
+                                      add(imm(p0),
+                                          mul(abs(x), imm(p1))))))))));
 }
 
 ir_call *
@@ -3251,7 +3311,7 @@ builtin_builder::_asin(const glsl_type *type)
    ir_variable *x = in_var(type, "x");
    MAKE_SIG(type, always_available, 1, x);
 
-   body.emit(ret(asin_expr(x)));
+   body.emit(ret(asin_expr(x, 0.086566724f, -0.03102955f)));
 
    return sig;
 }
@@ -3262,7 +3322,7 @@ builtin_builder::_acos(const glsl_type *type)
    ir_variable *x = in_var(type, "x");
    MAKE_SIG(type, always_available, 1, x);
 
-   body.emit(ret(sub(imm(M_PI_2f), asin_expr(x))));
+   body.emit(ret(sub(imm(M_PI_2f), asin_expr(x, 0.08132463f, -0.02363318f))));
 
    return sig;
 }
@@ -5145,6 +5205,25 @@ builtin_builder::_atomic_counter_intrinsic(builtin_available_predicate avail)
 }
 
 ir_function_signature *
+builtin_builder::_atomic_counter_intrinsic1(builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "counter");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_INTRINSIC(glsl_type::uint_type, avail, 2, counter, data);
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_atomic_counter_intrinsic2(builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "counter");
+   ir_variable *compare = in_var(glsl_type::uint_type, "compare");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_INTRINSIC(glsl_type::uint_type, avail, 3, counter, compare, data);
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_atomic_intrinsic2(builtin_available_predicate avail,
                                     const glsl_type *type)
 {
@@ -5180,6 +5259,37 @@ builtin_builder::_atomic_counter_op(const char *intrinsic,
 }
 
 ir_function_signature *
+builtin_builder::_atomic_counter_op1(const char *intrinsic,
+                                     builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "atomic_counter");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_SIG(glsl_type::uint_type, avail, 2, counter, data);
+
+   ir_variable *retval = body.make_temp(glsl_type::uint_type, "atomic_retval");
+   body.emit(call(shader->symbols->get_function(intrinsic), retval,
+                  sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_atomic_counter_op2(const char *intrinsic,
+                                     builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "atomic_counter");
+   ir_variable *compare = in_var(glsl_type::uint_type, "compare");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_SIG(glsl_type::uint_type, avail, 3, counter, compare, data);
+
+   ir_variable *retval = body.make_temp(glsl_type::uint_type, "atomic_retval");
+   body.emit(call(shader->symbols->get_function(intrinsic), retval,
+                  sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_atomic_op2(const char *intrinsic,
                              builtin_available_predicate avail,
                              const glsl_type *type)
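The builtin additions above wire up GL_ARB_shader_atomic_counter_ops: each existing __intrinsic_atomic_* gains an atomic_uint overload, and the new atomicCounter*ARB built-ins forward to those intrinsics. A small fragment-shader sketch of the new entry points (the binding and variable names are illustrative):

#version 420
#extension GL_ARB_shader_atomic_counter_ops : require

layout(binding = 0) uniform atomic_uint counter;

void main()
{
    uint prev = atomicCounterAddARB(counter, 4u);       // returns the value before the add
    atomicCounterMinARB(counter, prev);
    atomicCounterCompSwapARB(counter, prev, prev + 1u); // swaps only if the counter still holds prev
}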
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index d20fc4a816c..4e2de37fbba 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -323,6 +323,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].name = name;
    this->fields[this->num_fields].matrix_layout = GLSL_MATRIX_LAYOUT_INHERITED;
    this->fields[this->num_fields].location = slot;
+   this->fields[this->num_fields].offset = -1;
    this->fields[this->num_fields].interpolation = INTERP_QUALIFIER_NONE;
    this->fields[this->num_fields].centroid = 0;
    this->fields[this->num_fields].sample = 0;
diff --git a/src/compiler/glsl/glcpp/glcpp-lex.l b/src/compiler/glsl/glcpp/glcpp-lex.l
index fa9aa506912..d09441aac88 100644
--- a/src/compiler/glsl/glcpp/glcpp-lex.l
+++ b/src/compiler/glsl/glcpp/glcpp-lex.l
@@ -120,6 +120,11 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
 static int
 glcpp_lex_update_state_per_token (glcpp_parser_t *parser, int token)
 {
+	if (token != NEWLINE && token != SPACE && token != HASH_TOKEN &&
+	    !parser->lexing_version_directive) {
+		glcpp_parser_resolve_implicit_version(parser);
+	}
+
 	/* After the first non-space token in a line, we won't
 	 * allow any '#' to introduce a directive. */
 	if (token == NEWLINE) {
@@ -285,6 +290,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 <HASH>version{HSPACE}+ {
 	BEGIN INITIAL;
 	yyextra->space_tokens = 0;
+	yyextra->lexing_version_directive = 1;
 	RETURN_STRING_TOKEN (VERSION_TOKEN);
 }
@@ -314,6 +320,9 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 <HASH>{NEWLINE} {
 	BEGIN INITIAL;
+	yyextra->space_tokens = 0;
+	yylineno++;
+	yycolumn = 0;
 	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
 }
@@ -536,6 +545,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	}
 	yyextra->space_tokens = 1;
 	yyextra->lexing_directive = 0;
+	yyextra->lexing_version_directive = 0;
 	yylineno++;
 	yycolumn = 0;
 	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
@@ -546,6 +556,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	glcpp_error(yylloc, yyextra, "Unterminated comment");
 	BEGIN DONE; /* Don't keep matching this rule forever. */
 	yyextra->lexing_directive = 0;
+	yyextra->lexing_version_directive = 0;
 	if (! parser->last_token_was_newline)
 	   RETURN_TOKEN (NEWLINE);
 }
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 5c38f86d333..007b70b020d 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -266,45 +266,37 @@ control_line:
 		ralloc_asprintf_rewrite_tail (&parser->output, &parser->output_length, "\n");
 	}
 |	control_line_error
-|	HASH_TOKEN LINE {
-		glcpp_parser_resolve_implicit_version(parser);
-	} pp_tokens NEWLINE {
+|	HASH_TOKEN LINE pp_tokens NEWLINE {
 		if (parser->skip_stack == NULL ||
 		    parser->skip_stack->type == SKIP_NO_SKIP)
 		{
 			_glcpp_parser_expand_and_lex_from (parser,
-							   LINE_EXPANDED, $4,
+							   LINE_EXPANDED, $3,
 							   EXPANSION_MODE_IGNORE_DEFINED);
 		}
 	}
 ;
 
 control_line_success:
-	HASH_TOKEN DEFINE_TOKEN {
-		glcpp_parser_resolve_implicit_version(parser);
-	} define
-|	HASH_TOKEN UNDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER NEWLINE {
+	HASH_TOKEN DEFINE_TOKEN define
+|	HASH_TOKEN UNDEF IDENTIFIER NEWLINE {
 		macro_t *macro;
-		if (strcmp("__LINE__", $4) == 0
-		    || strcmp("__FILE__", $4) == 0
-		    || strcmp("__VERSION__", $4) == 0
-		    || strncmp("GL_", $4, 3) == 0)
+		if (strcmp("__LINE__", $3) == 0
+		    || strcmp("__FILE__", $3) == 0
+		    || strcmp("__VERSION__", $3) == 0
+		    || strncmp("GL_", $3, 3) == 0)
 			glcpp_error(& @1, parser, "Built-in (pre-defined)"
 				    " macro names cannot be undefined.");
-		macro = hash_table_find (parser->defines, $4);
+		macro = hash_table_find (parser->defines, $3);
 		if (macro) {
-			hash_table_remove (parser->defines, $4);
+			hash_table_remove (parser->defines, $3);
 			ralloc_free (macro);
 		}
-		ralloc_free ($4);
+		ralloc_free ($3);
 	}
-|	HASH_TOKEN IF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} pp_tokens NEWLINE {
+|	HASH_TOKEN IF pp_tokens NEWLINE {
 		/* Be careful to only evaluate the 'if' expression if
 		 * we are not skipping.  When we are skipping, we
 		 * simply push a new 0-valued 'if' onto the skip
@@ -316,7 +308,7 @@ control_line_success:
 		    parser->skip_stack->type == SKIP_NO_SKIP)
 		{
 			_glcpp_parser_expand_and_lex_from (parser,
-							   IF_EXPANDED, $4,
+							   IF_EXPANDED, $3,
 							   EXPANSION_MODE_EVALUATE_DEFINED);
 		}
 		else
@@ -335,18 +327,14 @@ control_line_success:
 		}
 		_glcpp_parser_skip_stack_push_if (parser, & @1, 0);
 	}
-|	HASH_TOKEN IFDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER junk NEWLINE {
-		macro_t *macro = hash_table_find (parser->defines, $4);
-		ralloc_free ($4);
+|	HASH_TOKEN IFDEF IDENTIFIER junk NEWLINE {
+		macro_t *macro = hash_table_find (parser->defines, $3);
+		ralloc_free ($3);
 		_glcpp_parser_skip_stack_push_if (parser, & @1, macro != NULL);
 	}
-|	HASH_TOKEN IFNDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER junk NEWLINE {
-		macro_t *macro = hash_table_find (parser->defines, $4);
-		ralloc_free ($4);
+|	HASH_TOKEN IFNDEF IDENTIFIER junk NEWLINE {
+		macro_t *macro = hash_table_find (parser->defines, $3);
+		ralloc_free ($3);
 		_glcpp_parser_skip_stack_push_if (parser, & @3, macro == NULL);
 	}
 |	HASH_TOKEN ELIF pp_tokens NEWLINE {
@@ -2494,6 +2482,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	if (extensions->ARB_shader_atomic_counters)
 		add_builtin_define(parser, "GL_ARB_shader_atomic_counters", 1);
 
+	if (extensions->ARB_shader_atomic_counter_ops)
+		add_builtin_define(parser, "GL_ARB_shader_atomic_counter_ops", 1);
+
 	if (extensions->ARB_viewport_array)
 		add_builtin_define(parser, "GL_ARB_viewport_array", 1);
diff --git a/src/compiler/glsl/glcpp/glcpp.h b/src/compiler/glsl/glcpp/glcpp.h
index 70aa14b6ec0..d87e6b77dc5 100644
--- a/src/compiler/glsl/glcpp/glcpp.h
+++ b/src/compiler/glsl/glcpp/glcpp.h
@@ -176,6 +176,7 @@ struct glcpp_parser {
 	struct hash_table *defines;
 	active_list_t *active;
 	int lexing_directive;
+	int lexing_version_directive;
 	int space_tokens;
 	int last_token_was_newline;
 	int last_token_was_space;
diff --git a/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected b/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
index fd0b41347fa..5206a5c553c 100644
--- a/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
+++ b/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
@@ -1,2 +1,2 @@
 0:1(9): preprocessor error: #define followed by a non-identifier: 123
-0:1(9): preprocessor error: syntax error, unexpected INTEGER_STRING, expecting FUNC_IDENTIFIER or OBJ_IDENTIFIER
+0:1(9): preprocessor error: syntax error, unexpected INTEGER_STRING, expecting FUNC_IDENTIFIER or OBJ_IDENTIFIER or NEWLINE
diff --git a/src/compiler/glsl/glcpp/tests/144-implicit-version.c b/src/compiler/glsl/glcpp/tests/144-implicit-version.c
new file mode 100644
index 00000000000..7bf72fc19e9
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/144-implicit-version.c
@@ -0,0 +1 @@
+int x = __VERSION__;
diff --git a/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected b/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected
new file mode 100644
index 00000000000..8c2dfd9ce30
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected
@@ -0,0 +1 @@
+int x = 110;
diff --git a/src/compiler/glsl/glcpp/tests/145-version-first.c b/src/compiler/glsl/glcpp/tests/145-version-first.c
new file mode 100644
index 00000000000..f9fcfb08246
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/145-version-first.c
@@ -0,0 +1,2 @@
+123
+#version 120
diff --git a/src/compiler/glsl/glcpp/tests/145-version-first.c.expected b/src/compiler/glsl/glcpp/tests/145-version-first.c.expected
new file mode 100644
index 00000000000..f4092b04af7
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/145-version-first.c.expected
@@ -0,0 +1,3 @@
+0:2(1): preprocessor error: #version must appear on the first line
+123
+
diff --git a/src/compiler/glsl/glcpp/tests/146-version-first-hash.c b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c
new file mode 100644
index 00000000000..14dbe964bd6
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c
@@ -0,0 +1,2 @@
+#
+#version 120
diff --git a/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected
new file mode 100644
index 00000000000..e8e449793fd
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected
@@ -0,0 +1,3 @@
+0:2(1): preprocessor error: #version must appear on the first line
+
+
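The glcpp changes above (the new lexing_version_directive flag plus the simplified grammar actions) make the first non-space, non-hash token resolve the implicit GLSL version, so a #version directive that is not the very first thing in the shader is now diagnosed instead of silently accepted; tests 144-146 pin this behavior down. A two-line shader illustrating test 145:

123          // first token: the implicit version (110) is locked in here
#version 120 // error: #version must appear on the first line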
INVARIANT variable_identifier { @@ -1468,6 +1474,17 @@ layout_qualifier_id: "GLSL 4.40 or ARB_enhanced_layouts"); } + if (match_layout_qualifier("align", $1, state) == 0) { + if (!state->has_enhanced_layouts()) { + _mesa_glsl_error(& @1, state, + "align qualifier requires " + "GLSL 4.40 or ARB_enhanced_layouts"); + } else { + $$.flags.q.explicit_align = 1; + $$.align = $3; + } + } + if (match_layout_qualifier("location", $1, state) == 0) { $$.flags.q.explicit_location = 1; @@ -1498,7 +1515,8 @@ layout_qualifier_id: $$.binding = $3; } - if (state->has_atomic_counters() && + if ((state->has_atomic_counters() || + state->has_enhanced_layouts()) && match_layout_qualifier("offset", $1, state) == 0) { $$.flags.q.explicit_offset = 1; $$.offset = $3; @@ -2625,10 +2643,23 @@ basic_interface_block: $$ = block; } - | buffer_interface_qualifier NEW_IDENTIFIER '{' member_list '}' buffer_instance_name_opt ';' + | uniform_interface_qualifier NEW_IDENTIFIER '{' member_list '}' instance_name_opt ';' + { + ast_interface_block *const block = $6; + + block->layout = *state->default_uniform_qualifier; + block->block_name = $2; + block->declarations.push_degenerate_list_at_head(& $4->link); + + _mesa_ast_process_interface_block(& @1, state, block, $1); + + $$ = block; + } + | buffer_interface_qualifier NEW_IDENTIFIER '{' member_list '}' instance_name_opt ';' { ast_interface_block *const block = $6; + block->layout = *state->default_shader_storage_qualifier; block->block_name = $2; block->declarations.push_degenerate_list_at_head(& $4->link); @@ -2649,7 +2680,10 @@ interface_qualifier: memset(& $$, 0, sizeof($$)); $$.flags.q.out = 1; } - | UNIFORM + ; + +uniform_interface_qualifier: + UNIFORM { memset(& $$, 0, sizeof($$)); $$.flags.q.uniform = 1; @@ -2667,39 +2701,16 @@ buffer_interface_qualifier: instance_name_opt: /* empty */ { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - NULL, NULL); - } - | NEW_IDENTIFIER - { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - $1, NULL); - $$->set_location(@1); - } - | NEW_IDENTIFIER array_specifier - { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - $1, $2); - $$->set_location_range(@1, @2); - } - ; - -buffer_instance_name_opt: - /* empty */ - { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - NULL, NULL); + $$ = new(state) ast_interface_block(NULL, NULL); } | NEW_IDENTIFIER { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - $1, NULL); + $$ = new(state) ast_interface_block($1, NULL); $$->set_location(@1); } | NEW_IDENTIFIER array_specifier { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - $1, $2); + $$ = new(state) ast_interface_block($1, $2); $$->set_location_range(@1, @2); } ; diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index fe8b3bb2e79..1ac8489b45a 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -575,6 +575,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(ARB_gpu_shader_fp64, true, false, ARB_gpu_shader_fp64), EXT(ARB_sample_shading, true, false, ARB_sample_shading), EXT(ARB_separate_shader_objects, true, false, dummy_true), + EXT(ARB_shader_atomic_counter_ops, true, false, ARB_shader_atomic_counter_ops), EXT(ARB_shader_atomic_counters, true, false, ARB_shader_atomic_counters), EXT(ARB_shader_bit_encoding, true, false, ARB_shader_bit_encoding), 
EXT(ARB_shader_clock, true, false, ARB_shader_clock), @@ -926,7 +927,8 @@ _mesa_ast_process_interface_block(YYLTYPE *locp, block->layout.flags.i |= block_interface_qualifier; if (state->stage == MESA_SHADER_GEOMETRY && - state->has_explicit_attrib_stream()) { + state->has_explicit_attrib_stream() && + block->layout.flags.q.out) { /* Assign global layout's stream value. */ block->layout.flags.q.stream = 1; block->layout.flags.q.explicit_stream = 0; diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index 74825a0bd35..12a3a46928c 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -533,6 +533,8 @@ struct _mesa_glsl_parse_state { bool ARB_sample_shading_warn; bool ARB_separate_shader_objects_enable; bool ARB_separate_shader_objects_warn; + bool ARB_shader_atomic_counter_ops_enable; + bool ARB_shader_atomic_counter_ops_warn; bool ARB_shader_atomic_counters_enable; bool ARB_shader_atomic_counters_warn; bool ARB_shader_bit_encoding_enable; diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index f6ed16de0c3..f4519679ff3 100644 --- a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -866,7 +866,7 @@ public: unsigned stream; /** - * Location an atomic counter is stored at. + * Atomic or block member offset. */ unsigned offset; diff --git a/src/compiler/glsl/ir_builder.cpp b/src/compiler/glsl/ir_builder.cpp index c9cf1240dfe..d68647f4234 100644 --- a/src/compiler/glsl/ir_builder.cpp +++ b/src/compiler/glsl/ir_builder.cpp @@ -51,7 +51,7 @@ assign(deref lhs, operand rhs, operand condition, int writemask) void *mem_ctx = ralloc_parent(lhs.val); ir_assignment *assign = new(mem_ctx) ir_assignment(lhs.val, - rhs.val, + rhs.val, condition.val, writemask); @@ -89,11 +89,11 @@ swizzle(operand a, int swizzle, int components) void *mem_ctx = ralloc_parent(a.val); return new(mem_ctx) ir_swizzle(a.val, - GET_SWZ(swizzle, 0), - GET_SWZ(swizzle, 1), - GET_SWZ(swizzle, 2), - GET_SWZ(swizzle, 3), - components); + GET_SWZ(swizzle, 0), + GET_SWZ(swizzle, 1), + GET_SWZ(swizzle, 2), + GET_SWZ(swizzle, 3), + components); } ir_swizzle * diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp index 64c30fea9a3..4c6fb56f891 100644 --- a/src/compiler/glsl/link_interface_blocks.cpp +++ b/src/compiler/glsl/link_interface_blocks.cpp @@ -81,6 +81,66 @@ intrastage_match(ir_variable *a, return true; } +/** + * Return true if interface members mismatch and its not allowed by GLSL. + */ +static bool +interstage_member_mismatch(struct gl_shader_program *prog, + const glsl_type *c, const glsl_type *p) { + + if (c->length != p->length) + return true; + + for (unsigned i = 0; i < c->length; i++) { + if (c->fields.structure[i].type != p->fields.structure[i].type) + return true; + if (strcmp(c->fields.structure[i].name, + p->fields.structure[i].name) != 0) + return true; + if (c->fields.structure[i].location != + p->fields.structure[i].location) + return true; + if (c->fields.structure[i].patch != + p->fields.structure[i].patch) + return true; + + /* From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec: + * + * "It is a link-time error if, within the same stage, the + * interpolation qualifiers of variables of the same name do not + * match." 
+ */ + if (prog->IsES || prog->Version < 440) + if (c->fields.structure[i].interpolation != + p->fields.structure[i].interpolation) + return true; + + /* From Section 4.3.4 (Input Variables) of the GLSL ES 3.0 spec: + * + * "The output of the vertex shader and the input of the fragment + * shader form an interface. For this interface, vertex shader + * output variables and fragment shader input variables of the same + * name must match in type and qualification (other than precision + * and out matching to in). + * + * The table in Section 9.2.1 Linked Shaders of the GLSL ES 3.1 spec + * says that centroid no longer needs to match for varyings. + * + * The table in Section 9.2.1 Linked Shaders of the GLSL ES 3.2 spec + * says that sample need not match for varyings. + */ + if (!prog->IsES || prog->Version < 310) + if (c->fields.structure[i].centroid != + p->fields.structure[i].centroid) + return true; + if (!prog->IsES) + if (c->fields.structure[i].sample != + p->fields.structure[i].sample) + return true; + } + + return false; +} /** * Check if two interfaces match, according to interstage (in/out) interface @@ -90,10 +150,9 @@ intrastage_match(ir_variable *a, * an array and the producer interface is required to be a non-array. * This is used for tessellation control and geometry shader consumers. */ -bool -interstage_match(ir_variable *producer, - ir_variable *consumer, - bool extra_array_level) +static bool +interstage_match(struct gl_shader_program *prog, ir_variable *producer, + ir_variable *consumer, bool extra_array_level) { /* Unsized arrays should not occur during interstage linking. They * should have all been assigned a size by link_intrastage_shaders. @@ -106,9 +165,16 @@ interstage_match(ir_variable *producer, /* Exception: if both the interface blocks are implicitly declared, * don't force their types to match. They might mismatch due to the two * shaders using different GLSL versions, and that's ok. + * + * Also we store some member information such as interpolation in + * glsl_type that doesn't always have to match across shader stages. + * Therefore we make a pass over the members glsl_struct_field to make + * sure we don't reject shaders where fields don't need to match. 
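The version checks above encode when member qualifiers may legitimately differ across stages. For instance, these two hypothetical shaders disagree only on the interpolation of `color`; under the code above that is a link-time mismatch for ES and for desktop GLSL before 4.40, but accepted on 4.40+:

static const char *vs_source =
   "#version 440\n"
   "out Iface { flat vec4 color; } vs_out;\n"
   "void main() { vs_out.color = vec4(1.0); gl_Position = vec4(0.0); }\n";

static const char *fs_source =
   "#version 440\n"
   "in Iface { vec4 color; } fs_in;\n"
   "out vec4 frag;\n"
   "void main() { frag = fs_in.color; }\n";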
*/ - if (consumer->data.how_declared != ir_var_declared_implicitly || - producer->data.how_declared != ir_var_declared_implicitly) + if ((consumer->data.how_declared != ir_var_declared_implicitly || + producer->data.how_declared != ir_var_declared_implicitly) && + interstage_member_mismatch(prog, consumer->get_interface_type(), + producer->get_interface_type())) return false; } @@ -311,7 +377,7 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog, if (consumer_def == NULL) continue; - if (!interstage_match(var, consumer_def, extra_array_level)) { + if (!interstage_match(prog, var, consumer_def, extra_array_level)) { linker_error(prog, "definitions of interface block `%s' do not " "match\n", var->get_interface_type()->name); return; diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp index 7d755765852..c8fa181a15d 100644 --- a/src/compiler/glsl/link_uniform_blocks.cpp +++ b/src/compiler/glsl/link_uniform_blocks.cpp @@ -97,6 +97,11 @@ private: this->offset, type->std140_base_alignment(row_major)); } + virtual void set_buffer_offset(unsigned offset) + { + this->offset = offset; + } + virtual void visit_field(const glsl_type *type, const char *name, bool row_major, const glsl_type *, const unsigned packing, diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index deaba94df1c..940cc61181d 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -188,12 +188,15 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, this->enter_record(t, *name, row_major, packing); for (unsigned i = 0; i < t->length; i++) { - const char *field = t->fields.structure[i].name; - size_t new_length = name_length; + const char *field = t->fields.structure[i].name; + size_t new_length = name_length; if (t->fields.structure[i].type->is_record()) this->visit_field(&t->fields.structure[i]); + if (t->is_interface() && t->fields.structure[i].offset != -1) + this->set_buffer_offset(t->fields.structure[i].offset); + /* Append '.field' to the current variable name. */ if (name_length == 0) { ralloc_asprintf_rewrite_tail(name, &new_length, "%s", field); @@ -247,10 +250,10 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, record_array_count *= length; for (unsigned i = 0; i < length; i++) { - size_t new_length = name_length; + size_t new_length = name_length; - /* Append the subscript to the current variable name */ - ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i); + /* Append the subscript to the current variable name */ + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i); recursion(t->fields.array, name, new_length, row_major, record_type, @@ -298,6 +301,11 @@ program_resource_visitor::leave_record(const glsl_type *, const char *, bool, } void +program_resource_visitor::set_buffer_offset(unsigned) +{ +} + +void program_resource_visitor::set_record_array_count(unsigned) { } @@ -415,19 +423,19 @@ private: if(!is_shader_storage) this->num_shader_uniform_components += values; } else { - /* Accumulate the total number of uniform slots used by this shader. - * Note that samplers do not count against this limit because they - * don't use any storage on current hardware. - */ - if (!is_ubo_var && !is_shader_storage) - this->num_shader_uniform_components += values; + /* Accumulate the total number of uniform slots used by this shader. 
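The hunks above thread an explicit layout(offset=...) through the block walkers: glsl_struct_field::offset stays -1 unless the shader supplied one, and set_buffer_offset() resets the visitor's running byte offset when it did. A minimal sketch of the resulting placement rule, assuming std140-style base alignments (place_member() is not a Mesa API, just an illustration):

static unsigned glsl_align(unsigned value, unsigned alignment)
{
   /* Round up to a power-of-two boundary, as in the hunks above. */
   return (value + alignment - 1) & ~(alignment - 1);
}

static unsigned place_member(unsigned running_offset, int explicit_offset,
                             unsigned base_alignment)
{
   if (explicit_offset != -1)
      running_offset = (unsigned) explicit_offset;  /* layout(offset = N) */
   return glsl_align(running_offset, base_alignment);
}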
+ * Note that samplers do not count against this limit because they + * don't use any storage on current hardware. + */ + if (!is_ubo_var && !is_shader_storage) + this->num_shader_uniform_components += values; } /* If the uniform is already in the map, there's nothing more to do. */ unsigned id; if (this->map->get(id, name)) - return; + return; if (this->current_var->data.how_declared == ir_var_hidden) { this->hidden_map->put(this->num_hidden_uniforms, name); @@ -473,8 +481,8 @@ class parcel_out_uniform_storage : public program_resource_visitor { public: parcel_out_uniform_storage(struct gl_shader_program *prog, struct string_to_uint_map *map, - struct gl_uniform_storage *uniforms, - union gl_constant_value *values) + struct gl_uniform_storage *uniforms, + union gl_constant_value *values) : prog(prog), map(map), uniforms(uniforms), values(values) { } @@ -520,9 +528,9 @@ public: ubo_block_index = i; break; } - } - } - assert(ubo_block_index != -1); + } + } + assert(ubo_block_index != -1); /* Uniform blocks that were specified with an instance name must be * handled a little bit differently. The name of the variable is the @@ -676,6 +684,11 @@ private: } } + virtual void set_buffer_offset(unsigned offset) + { + this->ubo_byte_offset = offset; + } + virtual void set_record_array_count(unsigned record_array_count) { this->record_array_count = record_array_count; @@ -730,15 +743,15 @@ private: assert(found); if (!found) - return; + return; const glsl_type *base_type; if (type->is_array()) { - this->uniforms[id].array_elements = type->length; - base_type = type->fields.array; + this->uniforms[id].array_elements = type->length; + base_type = type->fields.array; } else { - this->uniforms[id].array_elements = 0; - base_type = type; + this->uniforms[id].array_elements = 0; + base_type = type; } /* Initialise opaque data */ @@ -822,11 +835,11 @@ private: this->uniforms[id].array_stride = glsl_align(type->without_array()->std140_size(row_major), 16); - } else { - this->uniforms[id].array_stride = 0; - } + } else { + this->uniforms[id].array_stride = 0; + } - if (type->without_array()->is_matrix()) { + if (type->without_array()->is_matrix()) { const glsl_type *matrix = type->without_array(); const unsigned N = matrix->base_type == GLSL_TYPE_DOUBLE ? 
8 : 4; const unsigned items = @@ -838,17 +851,17 @@ private: glsl_align(items * N, 16); else this->uniforms[id].matrix_stride = glsl_align(items * N, 16); - this->uniforms[id].row_major = row_major; - } else { - this->uniforms[id].matrix_stride = 0; - this->uniforms[id].row_major = false; - } + this->uniforms[id].row_major = row_major; + } else { + this->uniforms[id].matrix_stride = 0; + this->uniforms[id].row_major = false; + } } else { - this->uniforms[id].block_index = -1; - this->uniforms[id].offset = -1; - this->uniforms[id].array_stride = -1; - this->uniforms[id].matrix_stride = -1; - this->uniforms[id].row_major = false; + this->uniforms[id].block_index = -1; + this->uniforms[id].offset = -1; + this->uniforms[id].array_stride = -1; + this->uniforms[id].matrix_stride = -1; + this->uniforms[id].row_major = false; } this->values += values_for_type(type); @@ -914,36 +927,36 @@ public: */ int link_cross_validate_uniform_block(void *mem_ctx, - struct gl_uniform_block **linked_blocks, - unsigned int *num_linked_blocks, - struct gl_uniform_block *new_block) + struct gl_uniform_block **linked_blocks, + unsigned int *num_linked_blocks, + struct gl_uniform_block *new_block) { for (unsigned int i = 0; i < *num_linked_blocks; i++) { struct gl_uniform_block *old_block = &(*linked_blocks)[i]; if (strcmp(old_block->Name, new_block->Name) == 0) - return link_uniform_blocks_are_compatible(old_block, new_block) - ? i : -1; + return link_uniform_blocks_are_compatible(old_block, new_block) + ? i : -1; } *linked_blocks = reralloc(mem_ctx, *linked_blocks, - struct gl_uniform_block, - *num_linked_blocks + 1); + struct gl_uniform_block, + *num_linked_blocks + 1); int linked_block_index = (*num_linked_blocks)++; struct gl_uniform_block *linked_block = &(*linked_blocks)[linked_block_index]; memcpy(linked_block, new_block, sizeof(*new_block)); linked_block->Uniforms = ralloc_array(*linked_blocks, - struct gl_uniform_buffer_variable, - linked_block->NumUniforms); + struct gl_uniform_buffer_variable, + linked_block->NumUniforms); memcpy(linked_block->Uniforms, - new_block->Uniforms, - sizeof(*linked_block->Uniforms) * linked_block->NumUniforms); + new_block->Uniforms, + sizeof(*linked_block->Uniforms) * linked_block->NumUniforms); for (unsigned int i = 0; i < linked_block->NumUniforms; i++) { struct gl_uniform_buffer_variable *ubo_var = - &linked_block->Uniforms[i]; + &linked_block->Uniforms[i]; if (ubo_var->Name == ubo_var->IndexName) { ubo_var->Name = ralloc_strdup(*linked_blocks, ubo_var->Name); @@ -970,7 +983,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) ir_variable *const var = node->as_variable(); if ((var == NULL) || !var->is_in_buffer_block()) - continue; + continue; assert(var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage); @@ -992,7 +1005,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) const unsigned l = strlen(var->name); for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { - for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { + for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { if (sentinel) { const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name; const char *end = strchr(begin, sentinel); @@ -1010,13 +1023,13 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) } } else if (!strcmp(var->name, shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) { - found = true; - var->data.location = j; - break; - } - } - if (found) - break; + found = 
true; + var->data.location = j; + break; + } + } + if (found) + break; } assert(found); } @@ -1099,7 +1112,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog, struct gl_shader *sh = prog->_LinkedShaders[i]; if (sh == NULL) - continue; + continue; /* Uniforms that lack an initializer in the shader code have an initial * value of zero. This includes sampler uniforms. @@ -1120,13 +1133,13 @@ link_assign_uniform_locations(struct gl_shader_program *prog, uniform_size.start_shader(); foreach_in_list(ir_instruction, node, sh->ir) { - ir_variable *const var = node->as_variable(); + ir_variable *const var = node->as_variable(); - if ((var == NULL) || (var->data.mode != ir_var_uniform && - var->data.mode != ir_var_shader_storage)) - continue; + if ((var == NULL) || (var->data.mode != ir_var_uniform && + var->data.mode != ir_var_shader_storage)) + continue; - uniform_size.process(var); + uniform_size.process(var); } sh->num_samplers = uniform_size.num_shader_samplers; @@ -1136,8 +1149,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog, for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) { if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) { - sh->num_combined_uniform_components += - sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; + sh->num_combined_uniform_components += + sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; } } } @@ -1170,18 +1183,18 @@ link_assign_uniform_locations(struct gl_shader_program *prog, for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i] == NULL) - continue; + continue; parcel.start_shader((gl_shader_stage)i); foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) { - ir_variable *const var = node->as_variable(); + ir_variable *const var = node->as_variable(); if ((var == NULL) || (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage)) - continue; + continue; - parcel.set_and_process(var); + parcel.set_and_process(var); } prog->_LinkedShaders[i]->active_samplers = parcel.shader_samplers_used; diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 05cc1a2b7f8..34eb848a9c1 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -219,7 +219,7 @@ cross_validate_front_and_back_color(struct gl_shader_program *prog, */ void cross_validate_outputs_to_inputs(struct gl_shader_program *prog, - gl_shader *producer, gl_shader *consumer) + gl_shader *producer, gl_shader *consumer) { glsl_symbol_table parameters; ir_variable *explicit_locations[MAX_VARYING] = { NULL, }; @@ -312,8 +312,14 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog, } if (output != NULL) { - cross_validate_types_and_qualifiers(prog, input, output, - consumer->Stage, producer->Stage); + /* Interface blocks have their own validation elsewhere so don't + * try validating them here. + */ + if (!(input->get_interface_type() && + output->get_interface_type())) + cross_validate_types_and_qualifiers(prog, input, output, + consumer->Stage, + producer->Stage); } else { /* Check for input vars with unmatched output vars in prev stage * taking into account that interface blocks could have a matching @@ -348,7 +354,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, ir_variable *const var = node->as_variable(); if ((var == NULL) || (var->data.mode != int(mode))) - continue; + continue; /* A shader 'in' or 'out' variable is only really an input or output if * its value is used by other shader stages. 
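The early-out added to cross_validate_outputs_to_inputs() above keeps loose-varying validation away from interface blocks, which validate_interstage_inout_blocks() already checks member by member. In a hypothetical pair like this, only `w` still flows through cross_validate_types_and_qualifiers():

static const char *vs_source =
   "#version 150\n"
   "out B { vec4 v; } blk;\n"
   "out vec4 w;\n"
   "void main() { blk.v = vec4(0.0); w = vec4(1.0); gl_Position = w; }\n";

static const char *fs_source =
   "#version 150\n"
   "in B { vec4 v; } blk;\n"
   "in vec4 w;\n"
   "out vec4 color;\n"
   "void main() { color = blk.v + w; }\n";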
This will cause the @@ -356,7 +362,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, */ if (var->data.is_unmatched_generic_inout) { assert(var->data.mode != ir_var_temporary); - var->data.mode = ir_var_auto; + var->data.mode = ir_var_auto; } } @@ -748,8 +754,8 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, prog->LinkedTransformFeedback.Varyings = rzalloc_array(prog, - struct gl_transform_feedback_varying_info, - num_tfeedback_decls); + struct gl_transform_feedback_varying_info, + num_tfeedback_decls); unsigned num_outputs = 0; for (unsigned i = 0; i < num_tfeedback_decls; ++i) @@ -1561,9 +1567,9 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode) */ bool assign_varying_locations(struct gl_context *ctx, - void *mem_ctx, - struct gl_shader_program *prog, - gl_shader *producer, gl_shader *consumer, + void *mem_ctx, + struct gl_shader_program *prog, + gl_shader *producer, gl_shader *consumer, unsigned num_tfeedback_decls, tfeedback_decl *tfeedback_decls) { @@ -1755,7 +1761,7 @@ assign_varying_locations(struct gl_context *ctx, linker_error(prog, "%s shader varying %s not written " "by %s shader\n.", _mesa_shader_stage_to_string(consumer->Stage), - var->name, + var->name, _mesa_shader_stage_to_string(producer->Stage)); } else { linker_warning(prog, "%s shader varying %s not written " diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 3039232162a..76b700d3451 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -2417,7 +2417,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog, /* Reversed because we want a descending order sort below. */ return r->slots - l->slots; } - } to_assign[16]; + } to_assign[32]; + assert(max_index <= 32); unsigned num_attr = 0; @@ -2625,6 +2626,13 @@ assign_attribute_or_color_locations(gl_shader_program *prog, continue; } + if (num_attr >= max_index) { + linker_error(prog, "too many %s (max %u)", + target_index == MESA_SHADER_VERTEX ? 
+ "vertex shader inputs" : "fragment shader outputs", + max_index); + return false; + } to_assign[num_attr].slots = slots; to_assign[num_attr].var = var; num_attr++; diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h index a60bb6ed087..4311d1659ec 100644 --- a/src/compiler/glsl/linker.h +++ b/src/compiler/glsl/linker.h @@ -182,6 +182,8 @@ protected: virtual void leave_record(const glsl_type *type, const char *name, bool row_major, const unsigned packing); + virtual void set_buffer_offset(unsigned offset); + virtual void set_record_array_count(unsigned record_array_count); private: diff --git a/src/compiler/glsl/lower_buffer_access.cpp b/src/compiler/glsl/lower_buffer_access.cpp index 9ad811de9f1..f85b421cf27 100644 --- a/src/compiler/glsl/lower_buffer_access.cpp +++ b/src/compiler/glsl/lower_buffer_access.cpp @@ -440,6 +440,10 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx, else field_align = type->std140_base_alignment(field_row_major); + if (struct_type->fields.structure[i].offset != -1) { + intra_struct_offset = struct_type->fields.structure[i].offset; + } + intra_struct_offset = glsl_align(intra_struct_offset, field_align); if (strcmp(struct_type->fields.structure[i].name, diff --git a/src/compiler/glsl/opt_array_splitting.cpp b/src/compiler/glsl/opt_array_splitting.cpp index cceec6b6431..a294da56616 100644 --- a/src/compiler/glsl/opt_array_splitting.cpp +++ b/src/compiler/glsl/opt_array_splitting.cpp @@ -55,9 +55,9 @@ public: this->components = NULL; this->mem_ctx = NULL; if (var->type->is_array()) - this->size = var->type->length; + this->size = var->type->length; else - this->size = var->type->matrix_columns; + this->size = var->type->matrix_columns; } ir_variable *var; /* The key: the variable's pointer. */ @@ -137,7 +137,7 @@ ir_array_reference_visitor::get_variable_entry(ir_variable *var) foreach_in_list(variable_entry, entry, &this->variable_list) { if (entry->var == var) - return entry; + return entry; } variable_entry *entry = new(mem_ctx) variable_entry(var); @@ -185,8 +185,18 @@ ir_array_reference_visitor::visit_enter(ir_dereference_array *ir) /* If the access to the array has a variable index, we wouldn't * know which split variable this dereference should go to. */ - if (entry && !ir->array_index->as_constant()) - entry->split = false; + if (!ir->array_index->as_constant()) { + if (entry) + entry->split = false; + /* This variable indexing could come from a different array dereference + * that also has variable indexing, that is, something like a[b[a[b[0]]]]. + * If we return visit_continue_with_parent here for the first appearence + * of a, then we can miss that b also has indirect indexing (if this is + * the only place in the program where such indirect indexing into b + * happens), so keep going. + */ + return visit_continue; + } /* If the index is also array dereference, visit index. 
*/ if (ir->array_index->as_dereference_array()) @@ -208,7 +218,7 @@ ir_array_reference_visitor::visit_enter(ir_function_signature *ir) bool ir_array_reference_visitor::get_split_list(exec_list *instructions, - bool linked) + bool linked) { visit_list_elements(this, instructions); @@ -217,25 +227,25 @@ ir_array_reference_visitor::get_split_list(exec_list *instructions, */ if (!linked) { foreach_in_list(ir_instruction, node, instructions) { - ir_variable *var = node->as_variable(); - if (var) { - variable_entry *entry = get_variable_entry(var); - if (entry) - entry->remove(); - } + ir_variable *var = node->as_variable(); + if (var) { + variable_entry *entry = get_variable_entry(var); + if (entry) + entry->remove(); + } } } /* Trim out variables we found that we can't split. */ foreach_in_list_safe(variable_entry, entry, &variable_list) { if (debug) { - printf("array %s@%p: decl %d, split %d\n", - entry->var->name, (void *) entry->var, entry->declaration, - entry->split); + printf("array %s@%p: decl %d, split %d\n", + entry->var->name, (void *) entry->var, entry->declaration, + entry->split); } if (!(entry->declaration && entry->split)) { - entry->remove(); + entry->remove(); } } @@ -273,7 +283,7 @@ ir_array_splitting_visitor::get_splitting_entry(ir_variable *var) foreach_in_list(variable_entry, entry, this->variable_list) { if (entry->var == var) { - return entry; + return entry; } } @@ -301,7 +311,7 @@ ir_array_splitting_visitor::split_deref(ir_dereference **deref) if (constant->value.i[0] >= 0 && constant->value.i[0] < (int)entry->size) { *deref = new(entry->mem_ctx) - ir_dereference_variable(entry->components[constant->value.i[0]]); + ir_dereference_variable(entry->components[constant->value.i[0]]); } else { /* There was a constant array access beyond the end of the * array. This might have happened due to constant folding @@ -310,8 +320,8 @@ ir_array_splitting_visitor::split_deref(ir_dereference **deref) * variable. 
*/ ir_variable *temp = new(entry->mem_ctx) ir_variable(deref_array->type, - "undef", - ir_var_temporary); + "undef", + ir_var_temporary); entry->components[0]->insert_before(temp); *deref = new(entry->mem_ctx) ir_dereference_variable(temp); } @@ -373,23 +383,21 @@ optimize_split_arrays(exec_list *instructions, bool linked) const struct glsl_type *subtype; if (type->is_matrix()) - subtype = type->column_type(); + subtype = type->column_type(); else - subtype = type->fields.array; + subtype = type->fields.array; entry->mem_ctx = ralloc_parent(entry->var); - entry->components = ralloc_array(mem_ctx, - ir_variable *, - entry->size); + entry->components = ralloc_array(mem_ctx, ir_variable *, entry->size); for (unsigned int i = 0; i < entry->size; i++) { - const char *name = ralloc_asprintf(mem_ctx, "%s_%d", - entry->var->name, i); + const char *name = ralloc_asprintf(mem_ctx, "%s_%d", + entry->var->name, i); - entry->components[i] = - new(entry->mem_ctx) ir_variable(subtype, name, ir_var_temporary); - entry->var->insert_before(entry->components[i]); + entry->components[i] = + new(entry->mem_ctx) ir_variable(subtype, name, ir_var_temporary); + entry->var->insert_before(entry->components[i]); } entry->var->remove(); diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index c549230a83c..2421bd61954 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -120,6 +120,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].name = ralloc_strdup(this->fields.structure, fields[i].name); this->fields.structure[i].location = fields[i].location; + this->fields.structure[i].offset = fields[i].offset; this->fields.structure[i].interpolation = fields[i].interpolation; this->fields.structure[i].centroid = fields[i].centroid; this->fields.structure[i].sample = fields[i].sample; @@ -159,6 +160,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].name = ralloc_strdup(this->fields.structure, fields[i].name); this->fields.structure[i].location = fields[i].location; + this->fields.structure[i].offset = fields[i].offset; this->fields.structure[i].interpolation = fields[i].interpolation; this->fields.structure[i].centroid = fields[i].centroid; this->fields.structure[i].sample = fields[i].sample; @@ -880,6 +882,9 @@ glsl_type::record_compare(const glsl_type *b) const if (this->fields.structure[i].location != b->fields.structure[i].location) return false; + if (this->fields.structure[i].offset + != b->fields.structure[i].offset) + return false; if (this->fields.structure[i].interpolation != b->fields.structure[i].interpolation) return false; diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index 2f612d8857d..b0e6f3f730f 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -838,6 +838,14 @@ struct glsl_struct_field { int location; /** + * For interface blocks, members may have an explicit byte offset + * specified; -1 otherwise. + * + * Ignored for structs. + */ + int offset; + + /** * For interface blocks, the interpolation mode (as in * ir_variable::interpolation). 0 otherwise. 
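record_compare() now distinguishes types whose members carry different explicit offsets, which keeps the glsl_type singleton machinery from unifying blocks with different layouts. A sketch of filling the new field (values are hypothetical; -1 preserves the old behaviour):

static void fill_field_example(glsl_struct_field *field)
{
   /* Only the members relevant to the new comparison are shown;
    * all other qualifiers keep their existing values. */
   field->type = glsl_type::vec4_type;
   field->name = "color";
   field->location = -1;   /* no explicit location */
   field->offset = 16;     /* layout(offset = 16); -1 when unspecified */
}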
*/ diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources index 04e8ab88a35..a876eff289a 100644 --- a/src/compiler/nir/Makefile.sources +++ b/src/compiler/nir/Makefile.sources @@ -32,10 +32,10 @@ NIR_FILES = \ nir_lower_clip.c \ nir_lower_global_vars_to_local.c \ nir_lower_gs_intrinsics.c \ - nir_lower_indirect_derefs.c \ nir_lower_load_const_to_scalar.c \ nir_lower_locals_to_regs.c \ nir_lower_idiv.c \ + nir_lower_indirect_derefs.c \ nir_lower_io.c \ nir_lower_outputs_to_temporaries.c \ nir_lower_phis_to_scalar.c \ diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index 6671691fd06..da5d730b49e 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -454,34 +454,8 @@ nir_visitor::create_function(ir_function_signature *ir) nir_function *func = nir_function_create(shader, ir->function_name()); - unsigned num_params = ir->parameters.length(); - func->num_params = num_params; - func->params = ralloc_array(shader, nir_parameter, num_params); - - unsigned i = 0; - foreach_in_list(ir_variable, param, &ir->parameters) { - switch (param->data.mode) { - case ir_var_function_in: - func->params[i].param_type = nir_parameter_in; - break; - - case ir_var_function_out: - func->params[i].param_type = nir_parameter_out; - break; - - case ir_var_function_inout: - func->params[i].param_type = nir_parameter_inout; - break; - - default: - unreachable("not reached"); - } - - func->params[i].type = param->type; - i++; - } - - func->return_type = ir->return_type; + assert(ir->parameters.is_empty()); + assert(ir->return_type == glsl_type::void_type); _mesa_hash_table_insert(this->overload_table, ir, func); } @@ -509,24 +483,9 @@ nir_visitor::visit(ir_function_signature *ir) nir_function_impl *impl = nir_function_impl_create(func); this->impl = impl; - unsigned num_params = func->num_params; - impl->num_params = num_params; - impl->params = ralloc_array(this->shader, nir_variable *, num_params); - unsigned i = 0; - foreach_in_list(ir_variable, param, &ir->parameters) { - param->accept(this); - impl->params[i] = this->var; - i++; - } - - if (func->return_type == glsl_type::void_type) { - impl->return_var = NULL; - } else { - impl->return_var = ralloc(this->shader, nir_variable); - impl->return_var->name = ralloc_strdup(impl->return_var, - "return_var"); - impl->return_var->type = func->return_type; - } + assert(strcmp(func->name, "main") == 0); + assert(ir->parameters.is_empty()); + assert(func->return_type == glsl_type::void_type); this->is_global = false; diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index 91206a92717..7e41ed37b0d 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -323,6 +323,8 @@ nir_function_impl_create(nir_function *function) impl->return_var->type = function->return_type; impl->return_var->data.mode = nir_var_param; impl->return_var->data.location = -1; + } else { + impl->return_var = NULL; } return impl; diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 0f8c78100bf..ae37cbf7325 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2094,8 +2094,8 @@ void nir_index_blocks(nir_function_impl *impl); void nir_print_shader(nir_shader *shader, FILE *fp); void nir_print_instr(const nir_instr *instr, FILE *fp); -nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s); -nir_function_impl *nir_function_impl_clone(const nir_function_impl *impl); +nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s); +nir_function_impl 
*nir_function_impl_clone(const nir_function_impl *fi); nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var); #ifdef DEBUG diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index 69f2df4ba6d..a4affa7bdcf 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -134,7 +134,7 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = orig_instr->num_components; - store->const_index[0] = orig_instr->const_index[0]; /* writemask */ + nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(orig_instr)); store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &deref->deref)); store->src[0] = nir_src_for_ssa(src); diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index c9c917b77a5..54f7d86843a 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -164,6 +164,8 @@ optimizations = [ (('ishr', a, 0), a), (('ushr', 0, a), 0), (('ushr', a, 0), a), + (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)), + (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)), # Exponential/logarithmic identities (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a (('flog2', ('fexp2', a)), a), # lg2(2^a) = a @@ -215,6 +217,16 @@ optimizations = [ (('f2i', ('ftrunc', a)), ('f2i', a)), (('f2u', ('ftrunc', a)), ('f2u', a)), + # Byte extraction + (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), + (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'), + (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'), + (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), + + # Word extraction + (('ushr', a, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), + (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), + # Subtracts (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), (('isub', a, ('isub', 0, b)), ('iadd', a, b)), diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 63e34ea5255..24d5281ec54 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -381,6 +381,14 @@ print_var(nir_variable *var, print_state *state) } static void +print_arg(nir_variable *var, print_state *state) +{ + FILE *fp = state->fp; + glsl_print_type(var->type, fp); + fprintf(fp, " %s", get_var_name(var, state)); +} + +static void print_deref_var(nir_deref_var *deref, print_state *state) { print_var(deref->var, state); @@ -942,14 +950,14 @@ print_function_impl(nir_function_impl *impl, print_state *state) if (i != 0) fprintf(fp, ", "); - print_var(impl->params[i], state); + print_arg(impl->params[i], state); } if (impl->return_var != NULL) { if (impl->num_params != 0) fprintf(fp, ", "); fprintf(fp, "returning "); - print_var(impl->return_var, state); + print_arg(impl->return_var, state); } fprintf(fp, "{\n"); diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index a9d213b95c4..0c32d5fe07a 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -938,6 +938,7 @@ validate_function_impl(nir_function_impl *impl, validate_state *state) assert(impl->num_params == impl->function->num_params); for (unsigned i = 0; i < impl->num_params; i++) { assert(impl->params[i]->type == 
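The new nir_opt_algebraic patterns above fold shift-and-mask idioms into extract_u8/extract_u16 unless the backend sets lower_extract_byte/lower_extract_word. Their reference semantics, written out in plain C (the helper names here only mirror the opcodes, they are not NIR API):

#include <stdint.h>

static inline uint32_t extract_u8(uint32_t a, unsigned byte)
{
   return (a >> (byte * 8)) & 0xff;    /* byte 0 is least significant */
}

static inline uint32_t extract_u16(uint32_t a, unsigned word)
{
   return (a >> (word * 16)) & 0xffff;
}

/* e.g. (iand 0xff (ushr a 16)) == extract_u8(a, 2), and the
 * (ushr a 24) case needs no mask at all, hence the extra iand rules. */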
impl->function->params[i].type); + assert(impl->params[i]->data.mode == nir_var_param); assert(impl->params[i]->data.location == i); validate_var_decl(impl->params[i], false, state); } @@ -946,6 +947,7 @@ validate_function_impl(nir_function_impl *impl, validate_state *state) assert(impl->return_var == NULL); } else { assert(impl->return_var->type == impl->function->return_type); + assert(impl->return_var->data.mode == nir_var_param); assert(impl->return_var->data.location == -1); validate_var_decl(impl->return_var, false, state); } diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index 341acb7ed1b..ff0d5c802ac 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -305,7 +305,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) { struct dri2_egl_display *dri2_dpy = dri2_egl_display(dri2_surf->base.Resource.Display); - int i; + int i, use_flags; unsigned int dri_image_format; /* currently supports three WL DRM formats, @@ -352,6 +352,8 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) if (dri2_surf->back == NULL) return -1; + use_flags = __DRI_IMAGE_USE_SHARE | __DRI_IMAGE_USE_BACKBUFFER; + if (dri2_dpy->is_different_gpu && dri2_surf->back->linear_copy == NULL) { dri2_surf->back->linear_copy = @@ -359,7 +361,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) dri2_surf->base.Width, dri2_surf->base.Height, dri_image_format, - __DRI_IMAGE_USE_SHARE | + use_flags | __DRI_IMAGE_USE_LINEAR, NULL); if (dri2_surf->back->linear_copy == NULL) @@ -373,7 +375,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) dri2_surf->base.Height, dri_image_format, dri2_dpy->is_different_gpu ? - 0 : __DRI_IMAGE_USE_SHARE, + 0 : use_flags, NULL); dri2_surf->back->age = 0; } diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c index 420f567651c..3ab91886e01 100644 --- a/src/egl/drivers/dri2/platform_x11.c +++ b/src/egl/drivers/dri2/platform_x11.c @@ -1006,6 +1006,9 @@ dri2_create_image_khr_pixmap(_EGLDisplay *disp, _EGLContext *ctx, geometry_cookie = xcb_get_geometry (dri2_dpy->conn, drawable); buffers_reply = xcb_dri2_get_buffers_reply (dri2_dpy->conn, buffers_cookie, NULL); + if (buffers_reply == NULL) + return NULL; + buffers = xcb_dri2_get_buffers_buffers (buffers_reply); if (buffers == NULL) { return NULL; diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c index 32f68233aeb..dd145a1195e 100644 --- a/src/egl/main/eglapi.c +++ b/src/egl/main/eglapi.c @@ -405,11 +405,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy) _EGL_CHECK_EXTENSION(KHR_image_pixmap); _EGL_CHECK_EXTENSION(KHR_reusable_sync); _EGL_CHECK_EXTENSION(KHR_surfaceless_context); - _EGL_CHECK_EXTENSION(KHR_vg_parent_image); _EGL_CHECK_EXTENSION(KHR_wait_sync); _EGL_CHECK_EXTENSION(MESA_configless_context); - _EGL_CHECK_EXTENSION(MESA_drm_display); _EGL_CHECK_EXTENSION(MESA_drm_image); _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export); @@ -1198,13 +1196,6 @@ eglGetError(void) } -static EGLDisplay EGLAPIENTRY -eglGetDRMDisplayMESA(int fd) -{ - _EGLDisplay *dpy = _eglFindDisplay(_EGL_PLATFORM_DRM, (void *) (intptr_t) fd); - return _eglGetDisplayHandle(dpy); -} - /** ** EGL 1.2 **/ @@ -1858,7 +1849,6 @@ eglGetProcAddress(const char *procname) { "eglGetPlatformDisplay", (_EGLProc) eglGetPlatformDisplay }, { "eglCreatePlatformWindowSurface", (_EGLProc) eglCreatePlatformWindowSurface }, { "eglCreatePlatformPixmapSurface", (_EGLProc) eglCreatePlatformPixmapSurface }, - { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA 
}, { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR }, { "eglDestroyImageKHR", (_EGLProc) eglDestroyImage }, { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR }, diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h index 6c54c7c410d..3f6d3c27a52 100644 --- a/src/egl/main/eglapi.h +++ b/src/egl/main/eglapi.h @@ -41,153 +41,153 @@ extern "C" { */ typedef void (*_EGLProc)(void); - -/** - * Typedefs for all EGL API entrypoint functions. - */ - -/* driver funcs */ -typedef EGLBoolean (*Initialize_t)(_EGLDriver *, _EGLDisplay *dpy); -typedef EGLBoolean (*Terminate_t)(_EGLDriver *, _EGLDisplay *dpy); - -/* config funcs */ -typedef EGLBoolean (*GetConfigs_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config); -typedef EGLBoolean (*ChooseConfig_t)(_EGLDriver *drv, _EGLDisplay *dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config); -typedef EGLBoolean (*GetConfigAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, EGLint attribute, EGLint *value); - -/* context funcs */ -typedef _EGLContext *(*CreateContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, _EGLContext *share_list, const EGLint *attrib_list); -typedef EGLBoolean (*DestroyContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx); -/* this is the only function (other than Initialize) that may be called with an uninitialized display */ -typedef EGLBoolean (*MakeCurrent_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw, _EGLSurface *read, _EGLContext *ctx); -typedef EGLBoolean (*QueryContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLint attribute, EGLint *value); - -/* surface funcs */ -typedef _EGLSurface *(*CreateWindowSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, void *native_window, const EGLint *attrib_list); -typedef _EGLSurface *(*CreatePixmapSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, void *native_pixmap, const EGLint *attrib_list); -typedef _EGLSurface *(*CreatePbufferSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, const EGLint *attrib_list); -typedef EGLBoolean (*DestroySurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface); -typedef EGLBoolean (*QuerySurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint attribute, EGLint *value); -typedef EGLBoolean (*SurfaceAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint attribute, EGLint value); -typedef EGLBoolean (*BindTexImage_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint buffer); -typedef EGLBoolean (*ReleaseTexImage_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint buffer); -typedef EGLBoolean (*SwapInterval_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf, EGLint interval); -typedef EGLBoolean (*SwapBuffers_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw); -typedef EGLBoolean (*CopyBuffers_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, void *native_pixmap_target); - -/* misc funcs */ -typedef EGLBoolean (*WaitClient_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx); -typedef EGLBoolean (*WaitNative_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLint engine); - -/* this function may be called from multiple threads at the same time */ -typedef _EGLProc (*GetProcAddress_t)(_EGLDriver *drv, const char *procname); - - - -typedef _EGLSurface *(*CreatePbufferFromClientBuffer_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum buftype, EGLClientBuffer buffer, _EGLConfig *config, 
const EGLint *attrib_list); - - -typedef _EGLImage *(*CreateImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLenum target, EGLClientBuffer buffer, const EGLint *attr_list); -typedef EGLBoolean (*DestroyImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image); - - -typedef _EGLSync *(*CreateSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, const EGLint *attrib_list, const EGLAttrib *attrib_list64); -typedef EGLBoolean (*DestroySyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); -typedef EGLint (*ClientWaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint flags, EGLTime timeout); -typedef EGLint (*WaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); -typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLenum mode); -typedef EGLBoolean (*GetSyncAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLAttrib *value); - - -typedef EGLBoolean (*SwapBuffersRegionNOK_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLint numRects, const EGLint *rects); - -typedef _EGLImage *(*CreateDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, const EGLint *attr_list); -typedef EGLBoolean (*ExportDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *name, EGLint *handle, EGLint *stride); - struct wl_display; -typedef EGLBoolean (*BindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display); -typedef EGLBoolean (*UnbindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display); -typedef EGLBoolean (*QueryWaylandBufferWL_t)(_EGLDriver *drv, _EGLDisplay *displ, struct wl_resource *buffer, EGLint attribute, EGLint *value); - -typedef struct wl_buffer * (*CreateWaylandBufferFromImageWL_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img); - -typedef EGLBoolean (*PostSubBufferNV_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surface, EGLint x, EGLint y, EGLint width, EGLint height); - -typedef EGLint (*QueryBufferAge_t)(_EGLDriver *drv, - _EGLDisplay *dpy, _EGLSurface *surface); - -typedef EGLBoolean (*SwapBuffersWithDamageEXT_t) (_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, const EGLint *rects, EGLint n_rects); - -typedef EGLBoolean (*GetSyncValuesCHROMIUM_t) (_EGLDisplay *dpy, _EGLSurface *surface, EGLuint64KHR *ust, EGLuint64KHR *msc, EGLuint64KHR *sbc); - -typedef EGLBoolean (*ExportDMABUFImageQueryMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fourcc, EGLint *nplanes, EGLuint64KHR *modifiers); -typedef EGLBoolean (*ExportDMABUFImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fds, EGLint *strides, EGLint *offsets); /** * The API dispatcher jumps through these functions */ struct _egl_api { - Initialize_t Initialize; - Terminate_t Terminate; - - GetConfigs_t GetConfigs; - ChooseConfig_t ChooseConfig; - GetConfigAttrib_t GetConfigAttrib; - - CreateContext_t CreateContext; - DestroyContext_t DestroyContext; - MakeCurrent_t MakeCurrent; - QueryContext_t QueryContext; - - CreateWindowSurface_t CreateWindowSurface; - CreatePixmapSurface_t CreatePixmapSurface; - CreatePbufferSurface_t CreatePbufferSurface; - DestroySurface_t DestroySurface; - QuerySurface_t QuerySurface; - SurfaceAttrib_t SurfaceAttrib; - BindTexImage_t BindTexImage; - ReleaseTexImage_t ReleaseTexImage; - SwapInterval_t SwapInterval; - SwapBuffers_t SwapBuffers; - CopyBuffers_t CopyBuffers; - - WaitClient_t WaitClient; - WaitNative_t WaitNative; - GetProcAddress_t 
GetProcAddress; - - CreatePbufferFromClientBuffer_t CreatePbufferFromClientBuffer; - - CreateImageKHR_t CreateImageKHR; - DestroyImageKHR_t DestroyImageKHR; - - CreateSyncKHR_t CreateSyncKHR; - DestroySyncKHR_t DestroySyncKHR; - ClientWaitSyncKHR_t ClientWaitSyncKHR; - WaitSyncKHR_t WaitSyncKHR; - SignalSyncKHR_t SignalSyncKHR; - GetSyncAttrib_t GetSyncAttrib; - - SwapBuffersRegionNOK_t SwapBuffersRegionNOK; - - CreateDRMImageMESA_t CreateDRMImageMESA; - ExportDRMImageMESA_t ExportDRMImageMESA; - - BindWaylandDisplayWL_t BindWaylandDisplayWL; - UnbindWaylandDisplayWL_t UnbindWaylandDisplayWL; - QueryWaylandBufferWL_t QueryWaylandBufferWL; - - CreateWaylandBufferFromImageWL_t CreateWaylandBufferFromImageWL; - - SwapBuffersWithDamageEXT_t SwapBuffersWithDamageEXT; - - PostSubBufferNV_t PostSubBufferNV; - - QueryBufferAge_t QueryBufferAge; - GetSyncValuesCHROMIUM_t GetSyncValuesCHROMIUM; - - ExportDMABUFImageQueryMESA_t ExportDMABUFImageQueryMESA; - ExportDMABUFImageMESA_t ExportDMABUFImageMESA; + /* driver funcs */ + EGLBoolean (*Initialize)(_EGLDriver *, _EGLDisplay *dpy); + EGLBoolean (*Terminate)(_EGLDriver *, _EGLDisplay *dpy); + + /* config funcs */ + EGLBoolean (*GetConfigs)(_EGLDriver *drv, _EGLDisplay *dpy, + EGLConfig *configs, EGLint config_size, + EGLint *num_config); + EGLBoolean (*ChooseConfig)(_EGLDriver *drv, _EGLDisplay *dpy, + const EGLint *attrib_list, EGLConfig *configs, + EGLint config_size, EGLint *num_config); + EGLBoolean (*GetConfigAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, EGLint attribute, + EGLint *value); + + /* context funcs */ + _EGLContext *(*CreateContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, _EGLContext *share_list, + const EGLint *attrib_list); + EGLBoolean (*DestroyContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx); + /* this is the only function (other than Initialize) that may be called + * with an uninitialized display + */ + EGLBoolean (*MakeCurrent)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *draw, _EGLSurface *read, + _EGLContext *ctx); + EGLBoolean (*QueryContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx, EGLint attribute, + EGLint *value); + + /* surface funcs */ + _EGLSurface *(*CreateWindowSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, void *native_window, + const EGLint *attrib_list); + _EGLSurface *(*CreatePixmapSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, void *native_pixmap, + const EGLint *attrib_list); + _EGLSurface *(*CreatePbufferSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, + const EGLint *attrib_list); + EGLBoolean (*DestroySurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface); + EGLBoolean (*QuerySurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint attribute, + EGLint *value); + EGLBoolean (*SurfaceAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint attribute, + EGLint value); + EGLBoolean (*BindTexImage)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint buffer); + EGLBoolean (*ReleaseTexImage)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint buffer); + EGLBoolean (*SwapInterval)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surf, EGLint interval); + EGLBoolean (*SwapBuffers)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *draw); + EGLBoolean (*CopyBuffers)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, void *native_pixmap_target); + + /* misc functions */ + EGLBoolean (*WaitClient)(_EGLDriver 
*drv, _EGLDisplay *dpy, + _EGLContext *ctx); + EGLBoolean (*WaitNative)(_EGLDriver *drv, _EGLDisplay *dpy, + EGLint engine); + + /* this function may be called from multiple threads at the same time */ + _EGLProc (*GetProcAddress)(_EGLDriver *drv, const char *procname); + + _EGLSurface *(*CreatePbufferFromClientBuffer)(_EGLDriver *drv, + _EGLDisplay *dpy, + EGLenum buftype, + EGLClientBuffer buffer, + _EGLConfig *config, + const EGLint *attrib_list); + + _EGLImage *(*CreateImageKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx, EGLenum target, + EGLClientBuffer buffer, + const EGLint *attr_list); + EGLBoolean (*DestroyImageKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLImage *image); + + _EGLSync *(*CreateSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, + const EGLint *attrib_list, + const EGLAttrib *attrib_list64); + EGLBoolean (*DestroySyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync); + EGLint (*ClientWaitSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLint flags, EGLTime timeout); + EGLint (*WaitSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); + EGLBoolean (*SignalSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLenum mode); + EGLBoolean (*GetSyncAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLint attribute, + EGLAttrib *value); + + EGLBoolean (*SwapBuffersRegionNOK)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLSurface *surf, EGLint numRects, + const EGLint *rects); + + _EGLImage *(*CreateDRMImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + const EGLint *attr_list); + EGLBoolean (*ExportDRMImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *name, + EGLint *handle, EGLint *stride); + + EGLBoolean (*BindWaylandDisplayWL)(_EGLDriver *drv, _EGLDisplay *disp, + struct wl_display *display); + EGLBoolean (*UnbindWaylandDisplayWL)(_EGLDriver *drv, _EGLDisplay *disp, + struct wl_display *display); + EGLBoolean (*QueryWaylandBufferWL)(_EGLDriver *drv, _EGLDisplay *displ, + struct wl_resource *buffer, + EGLint attribute, EGLint *value); + + struct wl_buffer *(*CreateWaylandBufferFromImageWL)(_EGLDriver *drv, + _EGLDisplay *disp, + _EGLImage *img); + + EGLBoolean (*SwapBuffersWithDamageEXT)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, + const EGLint *rects, EGLint n_rects); + + EGLBoolean (*PostSubBufferNV)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLSurface *surface, EGLint x, EGLint y, + EGLint width, EGLint height); + + EGLint (*QueryBufferAge)(_EGLDriver *drv, + _EGLDisplay *dpy, _EGLSurface *surface); + EGLBoolean (*GetSyncValuesCHROMIUM)(_EGLDisplay *dpy, _EGLSurface *surface, + EGLuint64KHR *ust, EGLuint64KHR *msc, + EGLuint64KHR *sbc); + + EGLBoolean (*ExportDMABUFImageQueryMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *fourcc, + EGLint *nplanes, + EGLuint64KHR *modifiers); + EGLBoolean (*ExportDMABUFImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *fds, + EGLint *strides, EGLint *offsets); }; diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h index 6c64980cf20..cec6d59e6a4 100644 --- a/src/egl/main/egldisplay.h +++ b/src/egl/main/egldisplay.h @@ -112,11 +112,9 @@ struct _egl_extensions EGLBoolean KHR_image_pixmap; EGLBoolean KHR_reusable_sync; EGLBoolean KHR_surfaceless_context; - EGLBoolean KHR_vg_parent_image; EGLBoolean KHR_wait_sync; EGLBoolean MESA_configless_context; - EGLBoolean MESA_drm_display; EGLBoolean MESA_drm_image; EGLBoolean MESA_image_dma_buf_export; diff --git a/src/egl/main/eglfallbacks.c 
b/src/egl/main/eglfallbacks.c index 65daf8fd0f5..d0fce8c20de 100644 --- a/src/egl/main/eglfallbacks.c +++ b/src/egl/main/eglfallbacks.c @@ -59,29 +59,29 @@ _eglInitDriverFallbacks(_EGLDriver *drv) drv->API.ChooseConfig = _eglChooseConfig; drv->API.GetConfigAttrib = _eglGetConfigAttrib; - drv->API.CreateContext = (CreateContext_t) _eglReturnFalse; - drv->API.DestroyContext = (DestroyContext_t) _eglReturnFalse; - drv->API.MakeCurrent = (MakeCurrent_t) _eglReturnFalse; + drv->API.CreateContext = (void*) _eglReturnFalse; + drv->API.DestroyContext = (void*) _eglReturnFalse; + drv->API.MakeCurrent = (void*) _eglReturnFalse; drv->API.QueryContext = _eglQueryContext; - drv->API.CreateWindowSurface = (CreateWindowSurface_t) _eglReturnFalse; - drv->API.CreatePixmapSurface = (CreatePixmapSurface_t) _eglReturnFalse; - drv->API.CreatePbufferSurface = (CreatePbufferSurface_t) _eglReturnFalse; + drv->API.CreateWindowSurface = (void*) _eglReturnFalse; + drv->API.CreatePixmapSurface = (void*) _eglReturnFalse; + drv->API.CreatePbufferSurface = (void*) _eglReturnFalse; drv->API.CreatePbufferFromClientBuffer = - (CreatePbufferFromClientBuffer_t) _eglReturnFalse; - drv->API.DestroySurface = (DestroySurface_t) _eglReturnFalse; + (void*) _eglReturnFalse; + drv->API.DestroySurface = (void*) _eglReturnFalse; drv->API.QuerySurface = _eglQuerySurface; drv->API.SurfaceAttrib = _eglSurfaceAttrib; - drv->API.BindTexImage = (BindTexImage_t) _eglReturnFalse; - drv->API.ReleaseTexImage = (ReleaseTexImage_t) _eglReturnFalse; - drv->API.CopyBuffers = (CopyBuffers_t) _eglReturnFalse; - drv->API.SwapBuffers = (SwapBuffers_t) _eglReturnFalse; + drv->API.BindTexImage = (void*) _eglReturnFalse; + drv->API.ReleaseTexImage = (void*) _eglReturnFalse; + drv->API.CopyBuffers = (void*) _eglReturnFalse; + drv->API.SwapBuffers = (void*) _eglReturnFalse; drv->API.SwapInterval = _eglSwapInterval; - drv->API.WaitClient = (WaitClient_t) _eglReturnFalse; - drv->API.WaitNative = (WaitNative_t) _eglReturnFalse; - drv->API.GetProcAddress = (GetProcAddress_t) _eglReturnFalse; + drv->API.WaitClient = (void*) _eglReturnFalse; + drv->API.WaitNative = (void*) _eglReturnFalse; + drv->API.GetProcAddress = (void*) _eglReturnFalse; drv->API.CreateImageKHR = NULL; drv->API.DestroyImageKHR = NULL; diff --git a/src/egl/wayland/wayland-egl/wayland-egl-priv.h b/src/egl/wayland/wayland-egl/wayland-egl-priv.h index 74a155202be..f1e3ba28309 100644 --- a/src/egl/wayland/wayland-egl/wayland-egl-priv.h +++ b/src/egl/wayland/wayland-egl/wayland-egl-priv.h @@ -1,10 +1,6 @@ #ifndef _WAYLAND_EGL_PRIV_H #define _WAYLAND_EGL_PRIV_H -#ifdef __cplusplus -extern "C" { -#endif - /* GCC visibility */ #if defined(__GNUC__) #define WL_EGL_EXPORT __attribute__ ((visibility("default"))) @@ -14,6 +10,10 @@ extern "C" { #include <wayland-client.h> +#ifdef __cplusplus +extern "C" { +#endif + struct wl_egl_window { struct wl_surface *surface; diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am index e42a8f17703..086e1701128 100644 --- a/src/gallium/Makefile.am +++ b/src/gallium/Makefile.am @@ -78,6 +78,12 @@ SUBDIRS += drivers/llvmpipe endif endif +if HAVE_GALLIUM_SWR +SUBDIRS += drivers/swr +SUBDIRS += drivers/swr/avx +SUBDIRS += drivers/swr/avx2 +endif + ## vc4/rpi if HAVE_GALLIUM_VC4 SUBDIRS += drivers/vc4 winsys/vc4/drm diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index f0013f70472..790e1211898 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ 
-847,7 +847,7 @@ void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_geometry_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->geometry_shader) { + if (handle == ctx->geometry_shader) { /* unbind before deleting */ ctx->pipe->bind_gs_state(ctx->pipe, NULL); ctx->geometry_shader = NULL; @@ -892,7 +892,7 @@ void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->tessctrl_shader) { + if (handle == ctx->tessctrl_shader) { /* unbind before deleting */ ctx->pipe->bind_tcs_state(ctx->pipe, NULL); ctx->tessctrl_shader = NULL; @@ -937,7 +937,7 @@ void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->tesseval_shader) { + if (handle == ctx->tesseval_shader) { /* unbind before deleting */ ctx->pipe->bind_tes_state(ctx->pipe, NULL); ctx->tesseval_shader = NULL; @@ -982,7 +982,7 @@ void cso_set_compute_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_compute_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->compute_shader) { + if (handle == ctx->compute_shader) { /* unbind before deleting */ ctx->pipe->bind_compute_state(ctx->pipe, NULL); ctx->compute_shader = NULL; diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index dcf05aac1d9..0d39ee4ec47 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -108,11 +108,11 @@ emit_segment(struct draw_stage *stage, struct prim_header *header, } -static inline unsigned +static inline bool stipple_test(int counter, ushort pattern, int factor) { int b = (counter / factor) & 0xf; - return (1 << b) & pattern; + return !!((1 << b) & pattern); } @@ -126,7 +126,7 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) const float *pos0 = v0->data[pos]; const float *pos1 = v1->data[pos]; float start = 0; - int state = 0; + bool state = 0; float x0 = pos0[0]; float x1 = pos1[0]; @@ -143,29 +143,29 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) stipple->counter = 0; - /* XXX ToDo: intead of iterating pixel-by-pixel, use a look-up table. + /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table. 
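stipple_test() above selects one of the 16 pattern bits, with each bit stretched over `factor` pixels; returning bool tightens the old truthy-unsigned contract so the `result != state` comparison is well defined. A standalone restatement with a worked example in the comment:

#include <stdbool.h>

static bool stipple_on(int counter, unsigned short pattern, int factor)
{
   int bit = (counter / factor) & 0xf;   /* wraps every 16 * factor pixels */
   return ((1 << bit) & pattern) != 0;
}

/* pattern = 0x00ff, factor = 1: pixels 0..7 draw, 8..15 skip, repeat. */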
*/ for (i = 0; i < length; i++) { - int result = stipple_test( (int) stipple->counter+i, - (ushort) stipple->pattern, stipple->factor ); + bool result = stipple_test((int)stipple->counter + i, + (ushort)stipple->pattern, stipple->factor); if (result != state) { /* changing from "off" to "on" or vice versa */ - if (state) { - if (start != i) { + if (state) { + if (start != i) { /* finishing an "on" segment */ - emit_segment( stage, header, start / length, i / length ); + emit_segment(stage, header, start / length, i / length); } - } - else { + } + else { /* starting an "on" segment */ - start = (float) i; - } - state = result; + start = (float)i; + } + state = result; } } if (state && start < length) - emit_segment( stage, header, start / length, 1.0 ); + emit_segment(stage, header, start / length, 1.0); stipple->counter += length; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h index 0da849bfe0c..083b0ad9fec 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h @@ -37,6 +37,9 @@ #include "gallivm/lp_bld.h" +#ifdef __cplusplus +extern "C" { +#endif struct lp_type; @@ -198,4 +201,8 @@ lp_build_array_alloca(struct gallivm_state *gallivm, LLVMValueRef count, const char *name); +#ifdef __cplusplus +} +#endif + #endif /* !LP_BLD_FLOW_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 9e50f88931d..ab44661a271 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -35,6 +35,9 @@ #include "lp_bld.h" #include <llvm-c/ExecutionEngine.h> +#ifdef __cplusplus +extern "C" { +#endif struct gallivm_state { @@ -82,4 +85,8 @@ void lp_set_store_alignment(LLVMValueRef Inst, unsigned Align); +#ifdef __cplusplus +} +#endif + #endif /* !LP_BLD_INIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index a6f0eff42f6..902ae41f960 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -42,6 +42,9 @@ #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_swizzle.h" +#ifdef __cplusplus +extern "C" { +#endif struct pipe_resource; struct pipe_sampler_view; @@ -625,5 +628,8 @@ lp_build_minify(struct lp_build_context *bld, LLVMValueRef level, boolean lod_scalar); +#ifdef __cplusplus +} +#endif #endif /* LP_BLD_SAMPLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index 1cbe47ca91f..614c6558ede 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -315,7 +315,7 @@ lp_build_tgsi_inst_llvm( } } - if (info->num_dst > 0) { + if (info->num_dst > 0 && info->opcode != TGSI_OPCODE_STORE) { bld_base->emit_store(bld_base, inst, info, emit_data.output); } return TRUE; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index cc4549778a3..b005d7a0ac1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -48,6 +48,10 @@ #include "tgsi/tgsi_scan.h" #include "tgsi/tgsi_info.h" +#ifdef __cplusplus +extern "C" { +#endif + #define LP_CHAN_ALL ~0 #define LP_MAX_INSTRUCTIONS 256 @@ -663,4 +667,8 @@ lp_build_tgsi_llvm( struct lp_build_tgsi_context * bld_base, const struct tgsi_token *tokens); +#ifdef __cplusplus +} +#endif + #endif /* LP_BLD_TGSI_H */ diff --git 
a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h index a9ab16f2b54..5bb77a5bde2 100644 --- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h @@ -6,6 +6,9 @@ #include "util/u_debug.h" #include "state_tracker/sw_winsys.h" +#ifdef GALLIUM_SWR +#include "swr/swr_public.h" +#endif /* Helper function to choose and instantiate one of the software rasterizers: * llvmpipe, softpipe. @@ -43,10 +46,15 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) #endif #if defined(GALLIUM_SOFTPIPE) - if (screen == NULL) + if (screen == NULL && strcmp(driver, "softpipe") == 0) screen = softpipe_create_screen(winsys); #endif +#if defined(GALLIUM_SWR) + if (screen == NULL && strcmp(driver, "swr") == 0) + screen = swr_create_screen(winsys); +#endif + return screen; } @@ -61,6 +69,8 @@ sw_screen_create(struct sw_winsys *winsys) default_driver = "llvmpipe"; #elif defined(GALLIUM_SOFTPIPE) default_driver = "softpipe"; +#elif defined(GALLIUM_SWR) + default_driver = "swr"; #else default_driver = ""; #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index cfe9b92ee1b..e5355f573bb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -1425,3 +1425,18 @@ tgsi_build_full_property( return size; } + +struct tgsi_full_src_register +tgsi_full_src_register_from_dst(const struct tgsi_full_dst_register *dst) +{ + struct tgsi_full_src_register src; + src.Register = tgsi_default_src_register(); + src.Register.File = dst->Register.File; + src.Register.Indirect = dst->Register.Indirect; + src.Register.Dimension = dst->Register.Dimension; + src.Register.Index = dst->Register.Index; + src.Indirect = dst->Indirect; + src.Dimension = dst->Dimension; + src.DimIndirect = dst->DimIndirect; + return src; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h index c5127e1855c..34d181ab247 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.h +++ b/src/gallium/auxiliary/tgsi/tgsi_build.h @@ -30,6 +30,8 @@ struct tgsi_token; +struct tgsi_full_dst_register; +struct tgsi_full_src_register; #if defined __cplusplus @@ -111,6 +113,9 @@ tgsi_build_full_instruction( struct tgsi_instruction_predicate tgsi_default_instruction_predicate(void); +struct tgsi_full_src_register +tgsi_full_src_register_from_dst(const struct tgsi_full_dst_register *dst); + #if defined __cplusplus } #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index f232f3870d1..c8b91bba534 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -637,6 +637,14 @@ iter_instruction( TXT(", "); ENM(bit, tgsi_memory_names); } + if (inst->Memory.Texture) { + TXT( ", " ); + ENM( inst->Memory.Texture, tgsi_texture_names ); + } + if (inst->Memory.Format) { + TXT( ", " ); + TXT( util_format_name(inst->Memory.Format) ); + } } switch (inst->Instruction.Opcode) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c index 70fc4604537..462bd15f01c 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.c +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -142,7 +142,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] = { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB }, { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ }, { 1, 1, 1, 0, 0, 0, 0, OTHR, 
"TXQS", TGSI_OPCODE_TXQS }, - { 1, 1, 0, 0, 0, 0, 0, NONE, "RESQ", TGSI_OPCODE_RESQ }, + { 1, 1, 0, 0, 0, 0, 0, OTHR, "RESQ", TGSI_OPCODE_RESQ }, { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 }, /* removed */ { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP }, { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ }, diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h index 1ff7874b8ce..b78d1aba714 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h @@ -28,12 +28,12 @@ #ifndef TGSI_SANITY_H #define TGSI_SANITY_H +#include "pipe/p_compiler.h" + #if defined __cplusplus extern "C" { #endif -#include "pipe/p_compiler.h" - struct tgsi_token; /* Check the given token stream for errors and common mistakes. diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c index b15ae69cf7a..6bd1a2e14d2 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_strings.c +++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c @@ -144,6 +144,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] = "TES_POINT_MODE", "NUM_CLIPDIST_ENABLED", "NUM_CULLDIST_ENABLED", + "FS_EARLY_DEPTH_STENCIL", }; const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] = diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c index 91baa01ad8b..77598d2cb79 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.c +++ b/src/gallium/auxiliary/tgsi/tgsi_text.c @@ -1388,7 +1388,9 @@ static boolean parse_declaration( struct translate_ctx *ctx ) if (str_match_nocase_whole(&cur, "ATOMIC")) { decl.Declaration.Atomic = 1; ctx->cur = cur; - } else if (str_match_nocase_whole(&cur, "SHARED")) { + } + } else if (file == TGSI_FILE_MEMORY) { + if (str_match_nocase_whole(&cur, "SHARED")) { decl.Declaration.Shared = 1; ctx->cur = cur; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.h b/src/gallium/auxiliary/tgsi/tgsi_text.h index 6a306e6b674..a34565795a9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.h +++ b/src/gallium/auxiliary/tgsi/tgsi_text.h @@ -28,12 +28,12 @@ #ifndef TGSI_TEXT_H #define TGSI_TEXT_H +#include "pipe/p_compiler.h" + #if defined __cplusplus extern "C" { #endif -#include "pipe/p_compiler.h" - struct tgsi_token; boolean diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index e1a72786476..ab1d03458ef 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -1242,7 +1242,9 @@ ureg_emit_texture_offset(struct ureg_program *ureg, void ureg_emit_memory(struct ureg_program *ureg, unsigned extended_token, - unsigned qualifier) + unsigned qualifier, + unsigned texture, + unsigned format) { union tgsi_any_token *out, *insn; @@ -1253,6 +1255,8 @@ ureg_emit_memory(struct ureg_program *ureg, out[0].value = 0; out[0].insn_memory.Qualifier = qualifier; + out[0].insn_memory.Texture = texture; + out[0].insn_memory.Format = format; } void @@ -1413,7 +1417,9 @@ ureg_memory_insn(struct ureg_program *ureg, unsigned nr_dst, const struct ureg_src *src, unsigned nr_src, - unsigned qualifier) + unsigned qualifier, + unsigned texture, + unsigned format) { struct ureg_emit_insn_result insn; unsigned i; @@ -1430,7 +1436,7 @@ ureg_memory_insn(struct ureg_program *ureg, nr_dst, nr_src); - ureg_emit_memory(ureg, insn.extended_token, qualifier); + ureg_emit_memory(ureg, insn.extended_token, qualifier, texture, format); for (i = 0; i < nr_dst; i++) ureg_emit_dst(ureg, dst[i]); diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h 
b/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 6a3b5ddf017..04a62a6e160 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -541,7 +541,9 @@ ureg_memory_insn(struct ureg_program *ureg, unsigned nr_dst, const struct ureg_src *src, unsigned nr_src, - unsigned qualifier); + unsigned qualifier, + unsigned texture, + unsigned format); /*********************************************************************** * Internal instruction helpers, don't call these directly: @@ -582,7 +584,9 @@ ureg_emit_texture_offset(struct ureg_program *ureg, void ureg_emit_memory(struct ureg_program *ureg, unsigned insn_token, - unsigned qualifier); + unsigned qualifier, + unsigned texture, + unsigned format); void ureg_emit_dst( struct ureg_program *ureg, diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h index c2707b402cb..85d0cb64e6c 100644 --- a/src/gallium/auxiliary/util/u_debug.h +++ b/src/gallium/auxiliary/util/u_debug.h @@ -39,6 +39,11 @@ #define U_DEBUG_H_ +#if defined(PIPE_OS_HAIKU) +/* Haiku provides debug_printf in libroot with OS.h */ +#include <OS.h> +#endif + #include "os/os_misc.h" #include "pipe/p_format.h" @@ -94,9 +99,6 @@ debug_printf(const char *format, ...) (void) format; /* silence warning */ #endif } -#else /* is Haiku */ -/* Haiku provides debug_printf in libroot with OS.h */ -#include <OS.h> #endif diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c index 2c3dc986a90..0a4786442fc 100644 --- a/src/gallium/auxiliary/util/u_debug_refcnt.c +++ b/src/gallium/auxiliary/util/u_debug_refcnt.c @@ -26,9 +26,14 @@ #if defined(DEBUG) -/* see http://www.mozilla.org/performance/refcnt-balancer.html for what do with the output - * on Linux, use tools/addr2line.sh to postprocess it before anything else - **/ +/** + * If the GALLIUM_REFCNT_LOG env var is defined as a filename, gallium + * reference counting will be logged to the file. + * + * See http://www-archive.mozilla.org/performance/refcnt-balancer.html + * for what to do with the output. On Linux, use tools/addr2line.sh to + * postprocess it before anything else. + */ #include <stdio.h> @@ -42,30 +47,41 @@ int debug_refcnt_state; -FILE* stream; +static FILE *stream; -/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */ +/* TODO: maybe move this serial machinery to a stand-alone module and + * expose it? + */ pipe_static_mutex(serials_mutex); -static struct util_hash_table* serials_hash; +static struct util_hash_table *serials_hash; static unsigned serials_last; -static unsigned hash_ptr(void* p) + +static unsigned +hash_ptr(void *p) { - return (unsigned)(uintptr_t)p; + return (unsigned) (uintptr_t) p; } -static int compare_ptr(void* a, void* b) + +static int +compare_ptr(void *a, void *b) { - if(a == b) + if (a == b) return 0; - else if(a < b) + else if (a < b) return -1; else return 1; } -static boolean debug_serial(void* p, unsigned* pserial) + +/** + * Return a small integer serial number for the given pointer. + */ +static boolean +debug_serial(void *p, unsigned *pserial) { unsigned serial; boolean found = TRUE; @@ -81,79 +97,99 @@ static boolean debug_serial(void* p, unsigned* pserial) pipe_mutex_lock(serials_mutex); if (!serials_hash) serials_hash = util_hash_table_create(hash_ptr, compare_ptr); - serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); - if(!serial) - { - /* time to stop logging...
(you'll have a 100 GB logfile at least at this point) - * TODO: avoid this + + serial = (unsigned) (uintptr_t) util_hash_table_get(serials_hash, p); + if (!serial) { + /* time to stop logging... (you'll have a 100 GB logfile at least at + * this point) TODO: avoid this */ serial = ++serials_last; - if(!serial) - { + if (!serial) { debug_error("More than 2^32 objects detected, aborting.\n"); os_abort(); } - util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial); + util_hash_table_set(serials_hash, p, (void *) (uintptr_t) serial); found = FALSE; } pipe_mutex_unlock(serials_mutex); + *pserial = serial; + return found; } -static void debug_serial_delete(void* p) + +/** + * Free the serial number for the given pointer. + */ +static void +debug_serial_delete(void *p) { pipe_mutex_lock(serials_mutex); util_hash_table_remove(serials_hash, p); pipe_mutex_unlock(serials_mutex); } + #define STACK_LEN 64 -static void dump_stack(const char* symbols[STACK_LEN]) +static void +dump_stack(const char *symbols[STACK_LEN]) { unsigned i; - for(i = 0; i < STACK_LEN; ++i) - { - if(symbols[i]) + for (i = 0; i < STACK_LEN; ++i) { + if (symbols[i]) fprintf(stream, "%s\n", symbols[i]); } fprintf(stream, "\n"); } -void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) + +/** + * Log a reference count change to the log file (if enabled). + * This is called via the pipe_reference() and debug_reference() functions, + * basically whenever a reference count is initialized or changed. + * + * \param p the refcount being changed (the value is not changed here) + * \param get_desc a function which will be called to print an object's + * name/pointer into a string buffer during logging + * \param change the reference count change which must be +/-1 or 0 when + * creating the object and initializing the refcount. 
+ */ +void +debug_reference_slowpath(const struct pipe_reference *p, + debug_reference_descriptor get_desc, int change) { - if(debug_refcnt_state < 0) + assert(change >= -1); + assert(change <= 1); + + if (debug_refcnt_state < 0) return; - if(!debug_refcnt_state) - { - const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); - if(filename && filename[0]) + if (!debug_refcnt_state) { + const char *filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); + if (filename && filename[0]) stream = fopen(filename, "wt"); - if(stream) + if (stream) debug_refcnt_state = 1; else debug_refcnt_state = -1; } - if(debug_refcnt_state > 0) - { + if (debug_refcnt_state > 0) { struct debug_stack_frame frames[STACK_LEN]; - const char* symbols[STACK_LEN]; + const char *symbols[STACK_LEN]; char buf[1024]; - unsigned i; unsigned refcnt = p->count; unsigned serial; - boolean existing = debug_serial((void*)p, &serial); + boolean existing = debug_serial((void *) p, &serial); debug_backtrace_capture(frames, 1, STACK_LEN); - for(i = 0; i < STACK_LEN; ++i) - { - if(frames[i].function) + for (i = 0; i < STACK_LEN; ++i) { + if (frames[i].function) symbols[i] = debug_symbol_name_cached(frames[i].function); else symbols[i] = 0; @@ -161,30 +197,28 @@ void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_de get_desc(buf, p); - if(!existing) - { + if (!existing) { fprintf(stream, "<%s> %p %u Create\n", buf, (void *) p, serial); dump_stack(symbols); - /* this is there to provide a gradual change even if we don't see the initialization */ - for(i = 1; i <= refcnt - change; ++i) - { + /* this is here to provide a gradual change even if we don't see + * the initialization + */ + for (i = 1; i <= refcnt - change; ++i) { fprintf(stream, "<%s> %p %u AddRef %u\n", buf, (void *) p, serial, i); dump_stack(symbols); } } - if(change) - { + if (change) { fprintf(stream, "<%s> %p %u %s %u\n", buf, (void *) p, serial, change > 0 ? 
"AddRef" : "Release", refcnt); dump_stack(symbols); } - if(!refcnt) - { - debug_serial_delete((void*)p); + if (!refcnt) { + debug_serial_delete((void *) p); fprintf(stream, "<%s> %p %u Destroy\n", buf, (void *) p, serial); dump_stack(symbols); } @@ -192,4 +226,5 @@ void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_de fflush(stream); } } -#endif + +#endif /* DEBUG */ diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h index 1f9218fec9a..cf047776661 100644 --- a/src/gallium/auxiliary/util/u_debug_refcnt.h +++ b/src/gallium/auxiliary/util/u_debug_refcnt.h @@ -40,9 +40,13 @@ typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*); extern int debug_refcnt_state; -void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change); +void +debug_reference_slowpath(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change); -static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +static inline void +debug_reference(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change) { if (debug_refcnt_state >= 0) debug_reference_slowpath(p, get_desc, change); @@ -50,7 +54,9 @@ static inline void debug_reference(const struct pipe_reference* p, debug_referen #else -static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +static inline void +debug_reference(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change) { } diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h index 80a00ed6796..d2f4737d42a 100644 --- a/src/gallium/auxiliary/util/u_dl.h +++ b/src/gallium/auxiliary/util/u_dl.h @@ -32,6 +32,9 @@ #include "pipe/p_config.h" +#ifdef __cplusplus +extern "C" { +#endif #if defined(PIPE_OS_WINDOWS) # define UTIL_DL_EXT ".dll" @@ -79,5 +82,8 @@ util_dl_close(struct util_dl_library *library); const char * util_dl_error(void); +#ifdef __cplusplus +} +#endif #endif /* U_DL_H_ */ diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h index b298ef2ae59..6553d5d7b6b 100644 --- a/src/gallium/auxiliary/util/u_draw_quad.h +++ b/src/gallium/auxiliary/util/u_draw_quad.h @@ -32,6 +32,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "util/u_draw.h" #ifdef __cplusplus extern "C" { @@ -40,8 +41,6 @@ extern "C" { struct pipe_resource; struct cso_context; -#include "util/u_draw.h" - extern void util_draw_vertex_buffer(struct pipe_context *pipe, struct cso_context *cso, struct pipe_resource *vbuf, uint vbuf_slot, diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h index f25f2807fe5..a9a53e4347a 100644 --- a/src/gallium/auxiliary/util/u_helpers.h +++ b/src/gallium/auxiliary/util/u_helpers.h @@ -28,12 +28,12 @@ #ifndef U_HELPERS_H #define U_HELPERS_H +#include "pipe/p_state.h" + #ifdef __cplusplus extern "C" { #endif -#include "pipe/p_state.h" - void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst, uint32_t *enabled_buffers, const struct pipe_vertex_buffer *src, diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index d0812039292..0e80cef0b08 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -622,6 +622,16 @@ util_copy_constant_buffer(struct 
pipe_constant_buffer *dst, } } +static inline void +util_copy_image_view(struct pipe_image_view *dst, + const struct pipe_image_view *src) +{ + pipe_resource_reference(&dst->resource, src->resource); + dst->format = src->format; + dst->access = src->access; + dst->u = src->u; +} + static inline unsigned util_max_layer(const struct pipe_resource *r, unsigned level) { diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c index adae84bbfab..0610535cd2c 100644 --- a/src/gallium/auxiliary/util/u_transfer.c +++ b/src/gallium/auxiliary/util/u_transfer.c @@ -98,7 +98,8 @@ u_resource( struct pipe_resource *res ) boolean u_resource_get_handle_vtbl(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct u_resource *ur = u_resource(resource); return ur->vtbl->resource_get_handle(screen, resource, handle); diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h index 6c25ee0f024..660dc161d33 100644 --- a/src/gallium/auxiliary/util/u_transfer.h +++ b/src/gallium/auxiliary/util/u_transfer.h @@ -78,7 +78,8 @@ struct u_resource { boolean u_resource_get_handle_vtbl(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle); + struct winsys_handle *handle, + unsigned usage); void u_resource_destroy_vtbl(struct pipe_screen *screen, struct pipe_resource *resource); diff --git a/src/gallium/auxiliary/util/u_video.h b/src/gallium/auxiliary/util/u_video.h index ddc00216105..9196afc11be 100644 --- a/src/gallium/auxiliary/util/u_video.h +++ b/src/gallium/auxiliary/util/u_video.h @@ -28,10 +28,6 @@ #ifndef U_VIDEO_H #define U_VIDEO_H -#ifdef __cplusplus -extern "C" { -#endif - #include "pipe/p_defines.h" #include "pipe/p_video_enums.h" @@ -40,6 +36,10 @@ extern "C" { #include "util/u_debug.h" #include "util/u_math.h" +#ifdef __cplusplus +extern "C" { +#endif + static inline enum pipe_video_format u_reduce_video_profile(enum pipe_video_profile profile) { diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c index 758f50d7c23..01365260312 100644 --- a/src/gallium/auxiliary/vl/vl_winsys_dri.c +++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c @@ -248,7 +248,8 @@ vl_dri2_screen_texture_from_drawable(struct vl_screen *vscreen, void *drawable) template.flags = 0; tex = scrn->base.pscreen->resource_from_handle(scrn->base.pscreen, &template, - &dri2_handle); + &dri2_handle, + PIPE_HANDLE_USAGE_READ_WRITE); free(reply); return tex; diff --git a/src/gallium/docs/source/drivers/openswr.rst b/src/gallium/docs/source/drivers/openswr.rst new file mode 100644 index 00000000000..84aa51f5d80 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr.rst @@ -0,0 +1,21 @@ +OpenSWR +======= + +The Gallium OpenSWR driver is a high performance, highly scalable +software renderer targeted towards visualization workloads. For such +geometry heavy workloads there is a considerable speedup over llvmpipe, +which is to be expected as the geometry frontend of llvmpipe is single +threaded. + +This rasterizer is x86 specific and requires AVX or AVX2. The driver +fits into the gallium framework, and reuses gallivm for doing the TGSI +to vectorized llvm-IR conversion of the shader kernels. + +.. 
toctree:: :glob: openswr/usage openswr/faq openswr/profiling openswr/knobs diff --git a/src/gallium/docs/source/drivers/openswr/faq.rst b/src/gallium/docs/source/drivers/openswr/faq.rst new file mode 100644 index 00000000000..596d77f3780 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/faq.rst @@ -0,0 +1,141 @@ +FAQ +=== + +Why another software rasterizer? +-------------------------------- + +Good question, given there are already three (swrast, softpipe, +llvmpipe) in the Mesa3D tree. Two important reasons for this: + + * Architecture - given our focus on scientific visualization, our + workloads are much different than the typical game; we have heavy + vertex load and relatively simple shaders. In addition, the core + counts of machines we run on are much higher. These parameters led + to design decisions much different than llvmpipe's. + + * Historical - Intel had developed a high performance software + graphics stack for internal purposes. Later we adapted this + graphics stack for use in visualization and decided to move forward + with Mesa3D to provide a high quality API layer while at the same + time benefiting from the excellent performance the software + rasterizer gives us. + +What's the architecture? +------------------------ + +SWR is a tile-based immediate-mode renderer with a sort-free threading +model which is arranged as a ring of queues. Each entry in the ring +represents a draw context that contains all of the draw state and work +queues. An API thread sets up each draw context, and worker threads +will execute both the frontend (vertex/geometry processing) and +backend (fragment) work as required. The ring allows for backend +threads to pull work in order. Large draws are split into chunks to +allow vertex processing to happen in parallel, with the backend work +pickup preserving draw ordering. + +Our pipeline uses just-in-time compiled code for the fetch shader that +does vertex attribute gathering and AOS to SOA conversions, the vertex +shader and fragment shaders, streamout, and fragment blending. The SWR +core also supports geometry and compute shaders, but we haven't exposed +them through our driver yet. The fetch shader, streamout, and blend are +built internally to the swr core using LLVM directly, while for the vertex +and pixel shaders we reuse bits of llvmpipe from +``gallium/auxiliary/gallivm`` to build the kernels, which we wrap +differently than llvmpipe's ``auxiliary/draw`` code. + +What's the performance? +----------------------- + +For the types of high-geometry workloads we're interested in, we are +significantly faster than llvmpipe. This is to be expected, as +llvmpipe only threads the fragment processing and not the geometry +frontend. The performance advantage over llvmpipe roughly scales +linearly with the number of cores available. + +While our current performance is quite good, we know there is more +potential in this architecture. When we switched from a prototype +OpenGL driver to Mesa we regressed performance severely, some due to +interface issues that need tuning, some due to differences in shader code +generation, and some due to conformance and feature additions to the +core swr. We are looking to recover most of this performance. + +What's the conformance? +----------------------- + +The major applications we are targeting are all based on the +Visualization Toolkit (VTK), and as such our development efforts have +been focused on making sure these work as well as possible.
Our +current code passes VTK's rendering tests with their new "OpenGL2" +(really OpenGL 3.2) backend at 99%. + +piglit testing shows a much lower pass rate, roughly 80% at the time +of writing. Core SWR undergoes rigorous unit testing, and we are quite +confident in the rasterizer and understand the areas where it +currently has issues (example: line rendering is done with triangles, +so it doesn't match the strict line rendering rules). The majority of +the piglit failures are errors in our driver layer interfacing Mesa +and SWR. Fixing these issues is one of our major future development +goals. + +Why are you open sourcing this? +------------------------------- + + * Our customers prefer open source, and allowing them to simply + download the Mesa source and enable our driver makes life much + easier for them. + + * The internal gallium APIs are not stable, so we'd like our driver + to be visible for changes. + + * It's easier to work with the Mesa community when the source we're + working with can be used as a reference. + +What are your development plans? +-------------------------------- + + * Performance - see the performance section earlier for details. + + * Conformance - see the conformance section earlier for details. + + * Features - core SWR has a lot of functionality we have yet to + expose through our driver, such as MSAA, geometry shaders, compute + shaders, and tessellation. + + * AVX512 support + +What is the licensing of the code? +---------------------------------- + + * All code is under the normal Mesa MIT license. + +Will this work on AMD? +---------------------- + + * If using an AMD processor with AVX or AVX2, it should work, though + we don't have that hardware around to test. Patches, if needed, + would be welcome. + +Will this work on ARM, MIPS, POWER, <other non-x86 architecture>? +------------------------------------------------------------------------- + + * Not without a lot of work. We make extensive use of AVX and AVX2 + intrinsics in our code and in the in-tree JIT creation. It is not the + intention for this codebase to support non-x86 architectures. + +What hardware do I need? +------------------------ + + * Any x86 processor with at least AVX (introduced in the Intel + SandyBridge and AMD Bulldozer microarchitectures in 2011) will + work. + + * You don't need a fire-breathing Xeon machine to work on SWR - we do + day-to-day development with laptops and desktop CPUs. + +Does one build work on both AVX and AVX2? +----------------------------------------- + +Yes. The build system creates two shared libraries, ``libswrAVX.so`` and +``libswrAVX2.so``, and ``swr_create_screen()`` loads the appropriate one at +runtime. + diff --git a/src/gallium/docs/source/drivers/openswr/knobs.rst b/src/gallium/docs/source/drivers/openswr/knobs.rst new file mode 100644 index 00000000000..06f228a2e92 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/knobs.rst @@ -0,0 +1,114 @@ +Knobs +===== + +OpenSWR has a number of environment variables which control its +operation, in addition to the normal Mesa and gallium controls. + +.. envvar:: KNOB_ENABLE_ASSERT_DIALOGS <bool> (true) + +Use dialogs when asserts fire. Asserts are only enabled in debug builds. + +.. envvar:: KNOB_SINGLE_THREADED <bool> (false) + +If enabled, all rendering is performed on the API thread. This is useful mainly for debugging purposes. + +.. envvar:: KNOB_DUMP_SHADER_IR <bool> (false) + +Dumps shader LLVM IR at various stages of JIT compilation. + +..
envvar:: KNOB_USE_GENERIC_STORETILE <bool> (false) + +Always use the generic function for performing StoreTile. Will be slightly slower than the optimized (jitted) path. + +.. envvar:: KNOB_FAST_CLEAR <bool> (true) + +Replace 3D primitive execute with a SWRClearRT operation and defer clear execution to the first backend op on the hottile, or the hottile store. + +.. envvar:: KNOB_MAX_NUMA_NODES <uint32_t> (0) + +Maximum # of NUMA-nodes per system used for worker threads. 0 == ALL NUMA-nodes in the system; N == use at most N NUMA-nodes for rendering. + +.. envvar:: KNOB_MAX_CORES_PER_NUMA_NODE <uint32_t> (0) + +Maximum # of cores per NUMA-node used for worker threads. 0 == ALL non-API thread cores per NUMA-node; N == use at most N cores per NUMA-node. + +.. envvar:: KNOB_MAX_THREADS_PER_CORE <uint32_t> (1) + +Maximum # of (hyper)threads per physical core used for worker threads. 0 == ALL hyper-threads per core; N == use at most N hyper-threads per physical core. + +.. envvar:: KNOB_MAX_WORKER_THREADS <uint32_t> (0) + +Maximum worker threads to spawn. IMPORTANT: If this is non-zero, no worker threads will be bound to specific HW threads. They will all be "floating" SW threads. In this case, the above 3 KNOBS will be ignored. + +.. envvar:: KNOB_BUCKETS_START_FRAME <uint32_t> (1200) + +Frame at which to start saving buckets data. NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h for this to have an effect. + +.. envvar:: KNOB_BUCKETS_END_FRAME <uint32_t> (1400) + +Frame at which to stop saving buckets data. NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h for this to have an effect. + +.. envvar:: KNOB_WORKER_SPIN_LOOP_COUNT <uint32_t> (5000) + +Number of spin-loop iterations worker threads will perform before going to sleep when waiting for work. + +.. envvar:: KNOB_MAX_DRAWS_IN_FLIGHT <uint32_t> (160) + +Maximum number of draws outstanding before the API thread blocks. + +.. envvar:: KNOB_MAX_PRIMS_PER_DRAW <uint32_t> (2040) + +Maximum primitives in a single Draw(). Larger draws are split into smaller Draw calls. Should be a multiple of (3 * vectorWidth). + +.. envvar:: KNOB_MAX_TESS_PRIMS_PER_DRAW <uint32_t> (16) + +Maximum primitives in a single Draw() with tessellation enabled. Larger draws are split into smaller Draw calls. Should be a multiple of (vectorWidth). + +.. envvar:: KNOB_MAX_FRAC_ODD_TESS_FACTOR <float> (63.0f) + +(DEBUG) Maximum tessellation factor for fractional-odd partitioning. + +.. envvar:: KNOB_MAX_FRAC_EVEN_TESS_FACTOR <float> (64.0f) + +(DEBUG) Maximum tessellation factor for fractional-even partitioning. + +.. envvar:: KNOB_MAX_INTEGER_TESS_FACTOR <uint32_t> (64) + +(DEBUG) Maximum tessellation factor for integer partitioning. + +.. envvar:: KNOB_BUCKETS_ENABLE_THREADVIZ <bool> (false) + +Enable threadviz output. + +.. envvar:: KNOB_TOSS_DRAW <bool> (false) + +Disable per-draw/dispatch execution. + +.. envvar:: KNOB_TOSS_QUEUE_FE <bool> (false) + +Stop per-draw execution at the worker frontend (FE). NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_FETCH <bool> (false) + +Stop per-draw execution at vertex fetch. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_IA <bool> (false) + +Stop per-draw execution at the input assembler. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_VS <bool> (false) + +Stop per-draw execution at the vertex shader. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +..
envvar:: KNOB_TOSS_SETUP_TRIS <bool> (false) + +Stop per-draw execution at primitive setup. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_BIN_TRIS <bool> (false) + +Stop per-draw execution at primitive binning. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_RS <bool> (false) + +Stop per-draw execution at the rasterizer. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + diff --git a/src/gallium/docs/source/drivers/openswr/profiling.rst b/src/gallium/docs/source/drivers/openswr/profiling.rst new file mode 100644 index 00000000000..357754c3506 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/profiling.rst @@ -0,0 +1,67 @@ +Profiling +========= + +OpenSWR contains built-in profiling which can be enabled +at build time to provide insight into performance tuning. + +To enable this, uncomment the following line in ``rasterizer/core/knobs.h`` and rebuild: :: + + //#define KNOB_ENABLE_RDTSC + +Running an application will result in a ``rdtsc.txt`` file being +created in the current working directory. This file contains profile +information captured between ``KNOB_BUCKETS_START_FRAME`` and +``KNOB_BUCKETS_END_FRAME`` (see the knobs section). + +The resulting file will contain sections for each thread with a +hierarchical breakdown of the time spent in the various operations. +For example: :: + + Thread 0 (API) + %Tot %Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket + 0.00 0.00 28370 2837 10 0 0 APIClearRenderTarget + 0.00 41.23 11698 1169 10 0 0 |-> APIDrawWakeAllThreads + 0.00 18.34 5202 520 10 0 0 |-> APIGetDrawContext + 98.72 98.72 12413773688 29957 414380 0 0 APIDraw + 0.36 0.36 44689364 107 414380 0 0 |-> APIDrawWakeAllThreads + 96.36 97.62 12117951562 9747 1243140 0 0 |-> APIGetDrawContext + 0.00 0.00 19904 995 20 0 0 APIStoreTiles + 0.00 7.88 1568 78 20 0 0 |-> APIDrawWakeAllThreads + 0.00 25.28 5032 251 20 0 0 |-> APIGetDrawContext + 1.28 1.28 161344902 64 2486370 0 0 APIGetDrawContext + 0.00 0.00 50368 2518 20 0 0 APISync + 0.00 2.70 1360 68 20 0 0 |-> APIDrawWakeAllThreads + 0.00 65.27 32876 1643 20 0 0 |-> APIGetDrawContext + + + Thread 1 (WORKER) + %Tot %Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket + 83.92 83.92 13198987522 96411 136902 0 0 FEProcessDraw + 24.91 29.69 3918184840 167 23410158 0 0 |-> FEFetchShader + 11.17 13.31 1756972646 75 23410158 0 0 |-> FEVertexShader + 8.89 10.59 1397902996 59 23410161 0 0 |-> FEPAAssemble + 19.06 22.71 2997794710 384 7803387 0 0 |-> FEClipTriangles + 11.67 61.21 1834958176 235 7803387 0 0 |-> FEBinTriangles + 0.00 0.00 0 0 187258 0 0 |-> FECullZeroAreaAndBackface + 0.00 0.00 0 0 60051033 0 0 |-> FECullBetweenCenters + 0.11 0.11 17217556 2869592 6 0 0 FEProcessStoreTiles + 15.97 15.97 2511392576 73665 34092 0 0 WorkerWorkOnFifoBE + 14.04 87.95 2208687340 9187 240408 0 0 |-> WorkerFoundWork + 0.06 0.43 9390536 13263 708 0 0 |-> BELoadTiles + 0.00 0.01 293020 182 1609 0 0 |-> BEClear + 12.63 89.94 1986508990 949 2093014 0 0 |-> BERasterizeTriangle + 2.37 18.75 372374596 177 2093014 0 0 |-> BETriangleSetup + 0.42 3.35 66539016 31 2093014 0 0 |-> BEStepSetup + 0.00 0.00 0 0 21766 0 0 |-> BETrivialReject + 1.05 8.33 165410662 79 2071248 0 0 |-> BERasterizePartial + 6.06 48.02 953847796 1260 756783 0 0 |-> BEPixelBackend + 0.20 3.30 31521202 41 756783 0 0 |-> BESetup + 0.16 2.69 25624304 33 756783 0 0 |-> BEBarycentric + 0.18 2.92 27884986 36 756783 0 0 |-> BEEarlyDepthTest + 0.19 3.20 30564174 41 744058 0 0 |-> BEPixelShader + 0.26 4.30
41058646 55 744058 0 0 |-> BEOutputMerger + 1.27 20.94 199750822 32 6054264 0 0 |-> BEEndTile + 0.33 2.34 51758160 23687 2185 0 0 |-> BEStoreTiles + 0.20 60.22 31169500 28807 1082 0 0 |-> B8G8R8A8_UNORM + 0.00 0.00 302752 302752 1 0 0 WorkerWaitForThreadEvent + diff --git a/src/gallium/docs/source/drivers/openswr/usage.rst b/src/gallium/docs/source/drivers/openswr/usage.rst new file mode 100644 index 00000000000..e55b4211a54 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/usage.rst @@ -0,0 +1,38 @@ +Usage +===== + +Requirements +^^^^^^^^^^^^ + +* An x86 processor with AVX or AVX2 +* LLVM version 3.6 or later + +Building +^^^^^^^^ + +To build with GNU automake, select building the swr driver at +configure time, for example: :: + + configure --with-gallium-drivers=swrast,swr + +Using +^^^^^ + +On Linux, building will create a drop-in alternative for libGL.so into:: + + lib/gallium/libGL.so + +or:: + + build/foo/gallium/targets/libgl-xlib/libGL.so + +To use it, set the LD_LIBRARY_PATH environment variable accordingly. + +**IMPORTANT:** Mesa defaults to llvmpipe or softpipe as its software renderer. To select the OpenSWR driver, set the GALLIUM_DRIVER environment variable appropriately: :: + + GALLIUM_DRIVER=swr + +To verify OpenSWR is being used, check whether a message like the following is printed when the application is started: :: + + SWR detected AVX2 + diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index b5d691f4f7e..46ec3815412 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -319,6 +319,10 @@ The integer capabilities: adjusted appropriately. * ``PIPE_CAP_QUERY_BUFFER_OBJECT``: Driver supports context::get_query_result_resource callback. +* ``PIPE_CAP_PCI_GROUP``: Return the PCI segment group number. +* ``PIPE_CAP_PCI_BUS``: Return the PCI bus number. +* ``PIPE_CAP_PCI_DEVICE``: Return the PCI device number. +* ``PIPE_CAP_PCI_FUNCTION``: Return the PCI function number. .. _pipe_capf: diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 489cbb0bc2f..af2df2251da 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -3206,6 +3206,12 @@ NUM_CULLDIST_ENABLED How many cull distance scalar outputs are enabled. +FS_EARLY_DEPTH_STENCIL +"""""""""""""""""""""" + +Whether depth test, stencil test, and occlusion query should run before +the fragment shader (regardless of fragment shader side effects). Corresponds +to GLSL early_fragment_tests.
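As a rough illustration (not part of this patch), a state tracker that sees GLSL's ``layout(early_fragment_tests) in;`` declaration could set this property through the existing tgsi_ureg interface. Only ``ureg_property()`` and the property token come from Mesa; the wrapper function below is a hypothetical sketch: ::

    /* Hedged sketch: mark a fragment shader for early depth/stencil.
     * emit_fs_properties() is a hypothetical helper; ureg_property()
     * is the existing tgsi_ureg entry point. */
    #include "tgsi/tgsi_ureg.h"

    static void
    emit_fs_properties(struct ureg_program *ureg, boolean early_tests)
    {
       if (early_tests)
          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
    }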
Texture Sampling and Texture Formats ------------------------------------ diff --git a/src/gallium/drivers/ddebug/dd_screen.c b/src/gallium/drivers/ddebug/dd_screen.c index 3706b2d63f5..fbc0bec73dd 100644 --- a/src/gallium/drivers/ddebug/dd_screen.c +++ b/src/gallium/drivers/ddebug/dd_screen.c @@ -179,11 +179,12 @@ dd_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * dd_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct pipe_screen *screen = dd_screen(_screen)->screen; struct pipe_resource *res = - screen->resource_from_handle(screen, templ, handle); + screen->resource_from_handle(screen, templ, handle, usage); if (!res) return NULL; @@ -218,11 +219,12 @@ dd_screen_resource_destroy(struct pipe_screen *_screen, static boolean dd_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct pipe_screen *screen = dd_screen(_screen)->screen; - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index 71ee55054d3..606252e6726 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index c6286a1f290..3838fdf44d0 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - 
/home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index b8a31d84b3f..f48d464c294 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -167,6 +167,7 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info) emit.key.binning_pass = false; emit.dirty = dirty; emit.vp = NULL; /* we changed key so need to refetch vp */ + emit.fp = NULL; draw_impl(ctx, ctx->ring, &emit); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 8c37992e17d..adfa9a96a46 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -353,7 +353,7 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit) int32_t i, j, last = -1; uint32_t total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; - struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); unsigned vertex_regid = regid(63, 0); unsigned instance_regid = regid(63, 0); unsigned vtxcnt_regid = regid(63, 0); @@ -478,8 +478,8 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd3_emit *emit) { - struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); - struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); uint32_t dirty = emit->dirty; emit_marker(ring, 5); @@ -656,9 +656,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RING(ring, HLSQ_FLUSH); if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ - ir3_emit_consts(vp, ring, emit->info, dirty); + ir3_emit_consts(vp, ring, ctx, emit->info, dirty); if (!emit->key.binning_pass) - ir3_emit_consts(fp, ring, emit->info, dirty); + ir3_emit_consts(fp, ring, ctx, emit->info, dirty); /* mark clean after emitting consts: */ ctx->prog.dirty = 0; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index 42483f6c39b..5dbb11599b5 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -58,10 +58,10 @@ struct fd3_emit { bool rasterflat; /* cached to avoid repeated lookups of same variants: */ - struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vp, *fp; }; -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd3_emit_get_vp(struct fd3_emit *emit) { if (!emit->vp) { @@ -71,12 +71,18 @@ fd3_emit_get_vp(struct fd3_emit *emit) return emit->vp; } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd3_emit_get_fp(struct fd3_emit *emit) { if (!emit->fp) { - struct fd3_shader_stateobj *so = emit->prog->fp; - emit->fp = ir3_shader_variant(so->shader, emit->key); + if (emit->key.binning_pass) { + /* use dummy stateobj to simplify binning vs 
non-binning: */ + static const struct ir3_shader_variant binning_fp = {}; + emit->fp = &binning_fp; + } else { + struct fd3_shader_stateobj *so = emit->prog->fp; + emit->fp = ir3_shader_variant(so->shader, emit->key); + } } return emit->fp; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 2ce393a41ae..815a310df83 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -931,9 +931,6 @@ fd3_emit_tile_init(struct fd_context *ctx) update_vsc_pipe(ctx); if (use_hw_binning(ctx)) { - /* mark the end of the binning cmds: */ - fd_ringmarker_mark(ctx->binning_end); - /* emit hw binning pass: */ emit_binning_pass(ctx); @@ -1017,8 +1014,8 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ (tile->p * 4), 0, 0); } else { OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index a64ecf16eab..57e269cc21f 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -51,7 +51,8 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso, type); + struct ir3_compiler *compiler = fd_context(pctx)->screen->compiler; + so->shader = ir3_shader_create(compiler, cso, type); return so; } @@ -139,14 +140,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, debug_assert(nr <= ARRAY_SIZE(color_regid)); vp = fd3_emit_get_vp(emit); - - if (emit->key.binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - fp = &binning_fp; - } else { - fp = fd3_emit_get_fp(emit); - } + fp = fd3_emit_get_fp(emit); vsi = &vp->info; fsi = &fp->info; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c index 722fe360202..4aea2fe0f37 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c @@ -106,7 +106,7 @@ fd3_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); screen->max_rts = A3XX_MAX_RENDER_TARGETS; - screen->compiler = ir3_compiler_create(screen->gpu_id); + screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); pscreen->context_create = fd3_context_create; pscreen->is_format_supported = fd3_screen_is_format_supported; } diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index d6fd1bb583e..98750123291 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 
32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: @@ -940,6 +940,7 @@ static inline uint32_t A4XX_RB_MODE_CONTROL_HEIGHT(uint32_t val) { return ((val >> 5) << A4XX_RB_MODE_CONTROL_HEIGHT__SHIFT) & A4XX_RB_MODE_CONTROL_HEIGHT__MASK; } +#define A4XX_RB_MODE_CONTROL_ENABLE_GMEM 0x00010000 #define REG_A4XX_RB_RENDER_CONTROL 0x000020a1 #define A4XX_RB_RENDER_CONTROL_BINNING_PASS 0x00000001 @@ -1613,6 +1614,7 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_TP_REG(uint32_t i0) { return 0x #define REG_A4XX_RBBM_POWER_CNTL_IP 0x00000098 #define A4XX_RBBM_POWER_CNTL_IP_SW_COLLAPSE 0x00000001 +#define A4XX_RBBM_POWER_CNTL_IP_SP_TP_PWR_ON 0x00100000 #define REG_A4XX_RBBM_PERFCTR_CP_0_LO 0x0000009c @@ -3689,6 +3691,20 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val) #define REG_A4XX_PC_BIN_BASE 0x000021c0 +#define REG_A4XX_PC_VSTREAM_CONTROL 0x000021c2 +#define A4XX_PC_VSTREAM_CONTROL_SIZE__MASK 0x003f0000 +#define A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT 16 +static inline uint32_t A4XX_PC_VSTREAM_CONTROL_SIZE(uint32_t val) +{ + return ((val) << A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT) & A4XX_PC_VSTREAM_CONTROL_SIZE__MASK; +} +#define A4XX_PC_VSTREAM_CONTROL_N__MASK 0x07c00000 +#define A4XX_PC_VSTREAM_CONTROL_N__SHIFT 22 +static inline uint32_t A4XX_PC_VSTREAM_CONTROL_N(uint32_t val) +{ + return ((val) << A4XX_PC_VSTREAM_CONTROL_N__SHIFT) & A4XX_PC_VSTREAM_CONTROL_N__MASK; +} + #define REG_A4XX_PC_PRIM_VTX_CNTL 0x000021c4 #define A4XX_PC_PRIM_VTX_CNTL_VAROUT__MASK 0x0000000f #define A4XX_PC_PRIM_VTX_CNTL_VAROUT__SHIFT 0 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index 7d6365bbb6d..62cfda97ac3 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -43,8 +43,6 @@ fd4_context_destroy(struct pipe_context *pctx) { struct fd4_context *fd4_ctx = fd4_context(fd_context(pctx)); - util_dynarray_fini(&fd4_ctx->rbrc_patches); - fd_bo_del(fd4_ctx->vs_pvt_mem); fd_bo_del(fd4_ctx->fs_pvt_mem); fd_bo_del(fd4_ctx->vsc_size_mem); @@ -127,8 +125,6 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!pctx) return NULL; - util_dynarray_init(&fd4_ctx->rbrc_patches); - fd4_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index 0c1027d5804..8996de932b8 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -40,11 +40,6 @@ struct fd4_context { struct fd_context base; - /* Keep track of writes to RB_RENDER_CONTROL which need to be patched - * once we know 
whether or not to use GMEM, and GMEM tile pitch. - */ - struct util_dynarray rbrc_patches; - struct fd_bo *vs_pvt_mem, *fs_pvt_mem; /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index 8cbe68d5790..c34f9441c7b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -156,6 +156,7 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info) emit.key.binning_pass = false; emit.dirty = dirty; emit.vp = NULL; /* we changed key so need to refetch vp */ + emit.fp = NULL; draw_impl(ctx, ctx->ring, &emit); } @@ -175,6 +176,43 @@ reset_viewport(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb) OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-half_height)); } +/* TODO maybe we should just migrate u_blitter for clear and do it in + * core (so we get normal draw pass state mgmt and binning).. That should + * work well enough for a3xx/a4xx (but maybe not a2xx?) + */ + +static void +fd4_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + struct fd4_emit emit = { + .vtx = &fd4_ctx->solid_vbuf_state, + .prog = &ctx->solid_prog, + .key = { + .binning_pass = true, + .half_precision = true, + }, + .dirty = dirty, + }; + + fd4_emit_state(ctx, ring, &emit); + fd4_emit_vertex_bufs(ring, &emit); + reset_viewport(ring, &ctx->framebuffer); + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_VAROUT(0) | + A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES)); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, 0x00000002); + + fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); +} + static void fd4_clear(struct fd_context *ctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) @@ -197,6 +235,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers, dirty |= FD_DIRTY_PROG; emit.dirty = dirty; + fd4_clear_binning(ctx, dirty); + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h index a6c56404a8a..2b23e33b42f 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h @@ -41,9 +41,10 @@ static inline uint32_t DRAW4(enum pc_di_primtype prim_type, enum pc_di_src_sel source_select, enum a4xx_index_size index_size, enum pc_di_vis_cull_mode vis_cull_mode) { - return (prim_type << 0) | - (source_select << 6) | - (index_size << 10); + return CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(prim_type) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(source_select) | + CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(vis_cull_mode); } static inline void diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index 72154bf286a..81ed16ce8ac 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -328,7 +328,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) int32_t i, j, last = -1; uint32_t 
total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; - struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); unsigned vertex_regid = regid(63, 0); unsigned instance_regid = regid(63, 0); unsigned vtxcnt_regid = regid(63, 0); @@ -460,8 +460,8 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd4_emit *emit) { - struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); uint32_t dirty = emit->dirty; emit_marker(ring, 5); @@ -485,19 +485,6 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); } - if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) { - uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control; - - /* I suppose if we needed to (which I don't *think* we need - * to), we could emit this for binning pass too. But we - * would need to keep a different patch-list for binning - * vs render pass. - */ - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RINGP(ring, val, &fd4_context(ctx)->rbrc_patches); - } - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -619,13 +606,17 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs); + unsigned n = pfb->nr_cbufs; + /* if we have depth/stencil, we need at least one MRT: */ + if (pfb->zsbuf) + n = MAX2(1, n); + fd4_program_emit(ring, emit, n, pfb->cbufs); } if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ - ir3_emit_consts(vp, ring, emit->info, dirty); + ir3_emit_consts(vp, ring, ctx, emit->info, dirty); if (!emit->key.binning_pass) - ir3_emit_consts(fp, ring, emit->info, dirty); + ir3_emit_consts(fp, ring, ctx, emit->info, dirty); /* mark clean after emitting consts: */ ctx->prog.dirty = 0; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index 3a1d4b617d3..d8d3fd88a69 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -59,7 +59,7 @@ struct fd4_emit { bool no_decode_srgb; /* cached to avoid repeated lookups of same variants: */ - struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vp, *fp; /* TODO: other shader stages..
*/ }; @@ -70,7 +70,7 @@ static inline enum a4xx_color_fmt fd4_emit_format(struct pipe_surface *surf) return fd4_pipe2color(surf->format); } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd4_emit_get_vp(struct fd4_emit *emit) { if (!emit->vp) { @@ -80,12 +80,18 @@ fd4_emit_get_vp(struct fd4_emit *emit) return emit->vp; } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd4_emit_get_fp(struct fd4_emit *emit) { if (!emit->fp) { - struct fd4_shader_stateobj *so = emit->prog->fp; - emit->fp = ir3_shader_variant(so->shader, emit->key); + if (emit->key.binning_pass) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct ir3_shader_variant binning_fp = {}; + emit->fp = &binning_fp; + } else { + struct fd4_shader_stateobj *so = emit->prog->fp; + emit->fp = ir3_shader_variant(so->shader, emit->key); + } } return emit->fp; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 221608127b4..0629c303656 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -130,6 +130,19 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } } +static bool +use_hw_binning(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + + /* this seems to be a hw bug.. but this hack fixes piglit fbo-maxsize: */ + if ((pfb->width > 4096) && (pfb->height > 4096)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + /* transfer from gmem to system memory (ie. normal RAM) */ static void @@ -502,18 +515,6 @@ patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode) util_dynarray_resize(&ctx->draw_patches, 0); } -static void -patch_rbrc(struct fd_context *ctx, uint32_t val) -{ - struct fd4_context *fd4_ctx = fd4_context(ctx); - unsigned i; - for (i = 0; i < fd_patch_num_elements(&fd4_ctx->rbrc_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&fd4_ctx->rbrc_patches, i); - *patch->cs = patch->val | val; - } - util_dynarray_resize(&fd4_ctx->rbrc_patches, 0); -} - /* for rendering directly to system memory: */ static void fd4_emit_sysmem_prep(struct fd_context *ctx) @@ -545,8 +546,10 @@ fd4_emit_sysmem_prep(struct fd_context *ctx) A4XX_RB_MODE_CONTROL_HEIGHT(0) | 0x00c00000); /* XXX */ + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); + patch_draws(ctx, IGNORE_VISIBILITY); - patch_rbrc(ctx, 0); // XXX } static void @@ -585,13 +588,76 @@ update_vsc_pipe(struct fd_context *ctx) } } +static void +emit_binning_pass(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + int i; + + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; + uint32_t y2 = gmem->miny + gmem->height - 1; + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, A4XX_PC_BINNING_COMMAND_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + 
A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + /* setup scissor/offset for whole screen: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(x1) | + A4XX_RB_BIN_OFFSET_Y(y1)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + } + + /* emit IB to binning drawcmds: */ + ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end); + + fd_reset_wfi(ctx); + fd_wfi(ctx, ring); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + fd_event_write(ctx, ring, CACHE_FLUSH); + fd_wfi(ctx, ring); +} + /* before first tile */ static void fd4_emit_tile_init(struct fd_context *ctx) { struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; - uint32_t rb_render_control; fd4_emit_restore(ctx); @@ -599,16 +665,30 @@ fd4_emit_tile_init(struct fd_context *ctx) OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_BINNING_PASS | + A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + 0x8); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | - 0x00010000); /* XXX */ - - update_vsc_pipe(ctx); - patch_draws(ctx, IGNORE_VISIBILITY); - - rb_render_control = 0; // XXX or BINNING_PASS.. 
but maybe we can emit only from gmem - patch_rbrc(ctx, rb_render_control); + A4XX_RB_MODE_CONTROL_ENABLE_GMEM); } /* before mem2gmem */ @@ -670,6 +750,7 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd4_context *fd4_ctx = fd4_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -679,6 +760,27 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + fd_event_write(ctx, ring, HLSQ_FLUSH); + fd_wfi(ctx, ring); + + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A4XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A4XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); @@ -696,6 +798,9 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); } void diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 74716fb733f..d782b94f848 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -51,7 +51,8 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso, type); + struct ir3_compiler *compiler = fd_context(pctx)->screen->compiler; + so->shader = ir3_shader_create(compiler, cso, type); return so; } @@ -150,14 +151,7 @@ setup_stages(struct fd4_emit *emit, struct stage *s) unsigned i; s[VS].v = fd4_emit_get_vp(emit); - - if (emit->key.binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - s[FS].v = &binning_fp; - } else { - s[FS].v = fd4_emit_get_fp(emit); - } + s[FS].v = fd4_emit_get_fp(emit); s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ @@ -223,6 +217,9 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, debug_assert(nr <= ARRAY_SIZE(color_regid)); + if (emit->key.binning_pass) + nr = 0; + setup_stages(emit, s); fssz = (s[FS].i->max_reg >= 24) ? 
TWO_QUADS : FOUR_QUADS; @@ -379,31 +376,49 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff)); OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | - A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | - A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | - A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | - 0x80000000 | /* XXX */ - COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | - COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); - if (emit->key.binning_pass) + if (emit->key.binning_pass) { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RING(ring, 0x00000000); - else + } else { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000 | /* XXX */ + COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | + COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1); OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) | @@ -427,11 +442,11 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, 
A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 14a809431ac..77e203f6c56 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -173,7 +173,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */ - OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0); /* ok... here we really *would* like to use the CP_SET_CONSTANT * mode which can add a constant to value in reg2 and write to @@ -187,7 +187,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) /* per-sample offset to scratch bo: */ OUT_PKT3(ring, CP_MEM_WRITE, 2); - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); OUT_RING(ring, samp->offset); /* now add to that the per-tile base: */ @@ -195,7 +195,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | CP_REG_TO_MEM_0_ACCUMULATE | CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */ - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); /* now copy that back to CP_ME_NRT_ADDR: */ OUT_PKT3(ring, CP_MEM_TO_REG, 2); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index b2a69cca56c..c193f361e4c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -105,7 +105,7 @@ fd4_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); screen->max_rts = A4XX_MAX_RENDER_TARGETS; - screen->compiler = ir3_compiler_create(screen->gpu_id); + screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); pscreen->context_create = fd4_context_create; pscreen->is_format_supported = fd4_screen_is_format_supported; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c index e14b617570d..a9c8d5a3d62 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c @@ -103,7 +103,5 @@ fd4_zsa_state_create(struct pipe_context *pctx, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; } - so->rb_render_control = 0x8; /* XXX */ - return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h index 6a92a9b6785..3c46117a3fe 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h @@ -39,7 +39,6 @@ struct fd4_zsa_stateobj { struct pipe_depth_stencil_alpha_state base; uint32_t gras_alpha_control; uint32_t rb_alpha_control; - uint32_t rb_render_control; uint32_t rb_depth_control; uint32_t rb_stencil_control; uint32_t rb_stencil_control2; diff --git 
a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index ac5343f1a78..4e361b0a246 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 09b26a253f0..932cfc0d5e8 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: @@ -389,7 +389,12 @@ static inline uint32_t CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(enum pc_di_src_sel va { return ((val) << CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__SHIFT) & CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__MASK; } -#define CP_DRAW_INDX_OFFSET_0_TESSELLATE 0x00000100 +#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK 0x00000300 +#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT 8 +static inline uint32_t CP_DRAW_INDX_OFFSET_0_VIS_CULL(enum pc_di_vis_cull_mode val) +{ + return ((val) << CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT) & CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK; +} #define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__MASK 0x00000c00 #define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__SHIFT 10 static inline uint32_t CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(enum a4xx_index_size val) diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c 
b/src/gallium/drivers/freedreno/freedreno_resource.c index bcdd518c8bf..9aded3bb7fe 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -637,7 +637,8 @@ fail: static struct pipe_resource * fd_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct fd_resource *rsc = CALLOC_STRUCT(fd_resource); struct fd_resource_slice *slice = &rsc->slices[0]; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 2b3ecfe664e..d47cb07f10b 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -166,6 +166,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_COMPUTE: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_SM3: @@ -241,7 +245,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_DEPTH_BOUNDS_TEST: case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_DRAW_PARAMETERS: @@ -257,6 +260,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VIEWPORTS: return 1; + case PIPE_CAP_SHAREABLE_SHADERS: + if (is_ir3(screen)) + return 1; + return 0; + /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: if (is_ir3(screen)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index 481859efb17..7ae4e94f0b3 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -233,7 +233,7 @@ int main(int argc, char **argv) tgsi_dump(toks, 0); nir_shader *nir = ir3_tgsi_to_nir(toks); - s.compiler = ir3_compiler_create(gpu_id); + s.compiler = ir3_compiler_create(NULL, gpu_id); s.nir = ir3_optimize_nir(&s, nir, NULL); v.key = key; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c index 7c8eccb54e1..37ad73380ab 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -30,9 +30,10 @@ #include "ir3_compiler.h" -struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id) +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id) { struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); + compiler->dev = dev; compiler->gpu_id = gpu_id; compiler->set = ir3_ra_alloc_reg_set(compiler); return compiler; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h index 697afeba61a..0ad689ca1e7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h @@ -34,12 +34,13 @@ struct ir3_ra_reg_set; struct ir3_compiler { + struct fd_device *dev; uint32_t gpu_id; struct ir3_ra_reg_set *set; uint32_t shader_count; }; -struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id); +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); void ir3_compiler_destroy(struct ir3_compiler *compiler); int 
ir3_compile_shader_nir(struct ir3_compiler *compiler, diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c index 565b9c32c1d..73c65d6ad27 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -45,6 +45,8 @@ ir3_tgsi_to_nir(const struct tgsi_token *tokens) .lower_flrp = true, .lower_ffract = true, .native_integers = true, + .lower_extract_byte = true, + .lower_extract_word = true, }; return tgsi_to_nir(tokens, &options); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 7d17f426ad3..c05b52e7a5e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -127,14 +127,14 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id) static void assemble_variant(struct ir3_shader_variant *v) { - struct fd_context *ctx = fd_context(v->shader->pctx); - uint32_t gpu_id = v->shader->compiler->gpu_id; + struct ir3_compiler *compiler = v->shader->compiler; + uint32_t gpu_id = compiler->gpu_id; uint32_t sz, *bin; bin = ir3_shader_assemble(v, gpu_id); sz = v->info.sizedwords * 4; - v->bo = fd_bo_new(ctx->dev, sz, + v->bo = fd_bo_new(compiler->dev, sz, DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM); @@ -266,14 +266,13 @@ ir3_shader_destroy(struct ir3_shader *shader) } struct ir3_shader * -ir3_shader_create(struct pipe_context *pctx, +ir3_shader_create(struct ir3_compiler *compiler, const struct pipe_shader_state *cso, enum shader_t type) { struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); - shader->compiler = fd_context(pctx)->screen->compiler; + shader->compiler = compiler; shader->id = ++shader->compiler->shader_count; - shader->pctx = pctx; shader->type = type; if (fd_mesa_debug & FD_DBG_DISASM) { DBG("dump tgsi: type=%d", shader->type); @@ -463,10 +462,9 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) #include "freedreno_resource.h" static void -emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_constbuf_stateobj *constbuf) +emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - struct fd_context *ctx = fd_context(v->shader->pctx); const unsigned index = 0; /* user consts are index 0 */ /* TODO save/restore dirty_mask for binning pass instead: */ uint32_t dirty_mask = constbuf->enabled_mask; @@ -502,12 +500,11 @@ emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, } static void -emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_constbuf_stateobj *constbuf) +emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { uint32_t offset = v->first_driver_param + IR3_UBOS_OFF; if (v->constlen > offset) { - struct fd_context *ctx = fd_context(v->shader->pctx); uint32_t params = MIN2(4, v->constlen - offset) * 4; uint32_t offsets[params]; struct fd_bo *bos[params]; @@ -532,9 +529,9 @@ emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, } static void -emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - struct fd_context *ctx = fd_context(v->shader->pctx); int size = v->immediates_count; uint32_t base = v->first_immediate; 
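The hunks above all make the same change: the const-emit helpers stop digging the context out of a back-pointer (v->shader->pctx) and instead receive a struct fd_context * as an explicit argument, while the variant pointer becomes const. That is what lets ir3_shader drop its pctx field a little further down, and it goes hand in hand with PIPE_CAP_SHAREABLE_SHADERS being turned on for ir3 earlier in this diff. A minimal, self-contained C sketch of the pattern follows; the types and names here (context, shader_variant, emit_consts) are hypothetical stand-ins, not the real Mesa structs. The same change continues in the next hunks for emit_tfbos, max_tf_vtx and ir3_emit_consts.

   #include <stdio.h>

   struct context { int id; };               /* stand-in for fd_context */
   struct shader_variant { int constlen; };  /* stand-in for ir3_shader_variant */

   /* After the refactor: the context arrives as an explicit argument and
    * the variant stays const, so a compiled variant is no longer tied to
    * the context that created it. */
   static void
   emit_consts(struct context *ctx, const struct shader_variant *v)
   {
      printf("ctx %d emits %d consts\n", ctx->id, v->constlen);
   }

   int
   main(void)
   {
      struct shader_variant v = { .constlen = 4 };
      struct context a = { .id = 1 }, b = { .id = 2 };
      emit_consts(&a, &v);   /* the same variant ... */
      emit_consts(&b, &v);   /* ... works from any context */
      return 0;
   }
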
@@ -556,12 +553,12 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) /* emit stream-out buffers: */ static void -emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { /* streamout addresses after driver-params: */ uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF; if (v->constlen > offset) { - struct fd_context *ctx = fd_context(v->shader->pctx); struct fd_streamout_stateobj *so = &ctx->streamout; struct pipe_stream_output_info *info = &v->shader->stream_output; uint32_t params = 4; @@ -587,9 +584,8 @@ emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) } static uint32_t -max_tf_vtx(struct ir3_shader_variant *v) +max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v) { - struct fd_context *ctx = fd_context(v->shader->pctx); struct fd_streamout_stateobj *so = &ctx->streamout; struct pipe_stream_output_info *info = &v->shader->stream_output; uint32_t maxvtxcnt = 0x7fffffff; @@ -633,11 +629,9 @@ max_tf_vtx(struct ir3_shader_variant *v) } void -ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - const struct pipe_draw_info *info, uint32_t dirty) +ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty) { - struct fd_context *ctx = fd_context(v->shader->pctx); - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { struct fd_constbuf_stateobj *constbuf; bool shader_dirty; @@ -653,10 +647,10 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, return; } - emit_user_consts(v, ring, constbuf); - emit_ubos(v, ring, constbuf); + emit_user_consts(ctx, v, ring, constbuf); + emit_ubos(ctx, v, ring, constbuf); if (shader_dirty) - emit_immediates(v, ring); + emit_immediates(ctx, v, ring); } /* emit driver params every time: */ @@ -667,7 +661,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, uint32_t vertex_params[IR3_DP_COUNT] = { [IR3_DP_VTXID_BASE] = info->indexed ? 
info->index_bias : info->start, - [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), + [IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v), }; /* if no user-clip-planes, we don't need to emit the * entire thing: @@ -692,7 +686,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, /* if needed, emit stream-out buffer addresses: */ if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { - emit_tfbos(v, ring); + emit_tfbos(ctx, v, ring); } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 03d4fa2e927..c89dc29ff08 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -241,7 +241,6 @@ struct ir3_shader { struct ir3_compiler *compiler; - struct pipe_context *pctx; /* TODO replace w/ pipe_screen */ nir_shader *nir; struct pipe_stream_output_info stream_output; @@ -250,7 +249,7 @@ struct ir3_shader { void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); -struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, +struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler, const struct pipe_shader_state *cso, enum shader_t type); void ir3_shader_destroy(struct ir3_shader *shader); struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, @@ -258,8 +257,9 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin); struct fd_ringbuffer; -void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - const struct pipe_draw_info *info, uint32_t dirty); +struct fd_context; +void ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty); static inline const char * ir3_shader_stage(struct ir3_shader *shader) diff --git a/src/gallium/drivers/i915/i915_resource.c b/src/gallium/drivers/i915/i915_resource.c index 627ed2b4445..3ffb0b7a5d2 100644 --- a/src/gallium/drivers/i915/i915_resource.c +++ b/src/gallium/drivers/i915/i915_resource.c @@ -23,7 +23,8 @@ i915_resource_create(struct pipe_screen *screen, static struct pipe_resource * i915_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (template->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 8d010f9dc8c..f4aa310ecdc 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -265,6 +265,10 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c index 9026ba9a983..8c888c529c4 100644 --- a/src/gallium/drivers/ilo/ilo_resource.c +++ b/src/gallium/drivers/ilo/ilo_resource.c @@ -714,7 +714,8 @@ ilo_resource_create(struct pipe_screen *screen, static struct pipe_resource * ilo_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (templ->target == 
PIPE_BUFFER) return NULL; @@ -725,7 +726,8 @@ ilo_resource_from_handle(struct pipe_screen *screen, static boolean ilo_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *res, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (res->target == PIPE_BUFFER) return false; diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index ef9da6b8315..548d215c718 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -493,6 +493,10 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 34008e1c01e..d9be7f392ef 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -116,6 +116,12 @@ struct lp_rast_plane { /* one-pixel sized trivial reject offsets for each plane */ uint32_t eo; + /* + * We rely on this struct being 64bit aligned (ideally it would be 128bit + * but that's quite the waste) and therefore on 32bit we need padding + * since otherwise (even with the 64bit number in there) it wouldn't be. + */ + uint32_t pad; }; /** diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 2c66bf46332..2529b546564 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -315,6 +315,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 29aee726941..98243a12de1 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -94,6 +94,8 @@ lp_setup_alloc_triangle(struct lp_scene *scene, unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane); struct lp_rast_triangle *tri; + STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0); + *tri_size = (sizeof(struct lp_rast_triangle) + 3 * input_array_sz + plane_sz); diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index ae266ceb082..c2ca8b8d6a0 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -434,7 +434,8 @@ llvmpipe_resource_data(struct pipe_resource *resource) static struct pipe_resource * llvmpipe_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; struct llvmpipe_resource *lpr; @@ -485,7 +486,8 @@ no_lpr: static boolean llvmpipe_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *pt, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; struct llvmpipe_resource *lpr = llvmpipe_resource(pt); diff --git 
a/src/gallium/drivers/noop/noop_pipe.c b/src/gallium/drivers/noop/noop_pipe.c index 165284a90bf..fd0a5d0f830 100644 --- a/src/gallium/drivers/noop/noop_pipe.c +++ b/src/gallium/drivers/noop/noop_pipe.c @@ -114,14 +114,15 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen, static struct pipe_resource *noop_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct noop_pipe_screen *noop_screen = (struct noop_pipe_screen*)screen; struct pipe_screen *oscreen = noop_screen->oscreen; struct pipe_resource *result; struct pipe_resource *noop_resource; - result = oscreen->resource_from_handle(oscreen, templ, handle); + result = oscreen->resource_from_handle(oscreen, templ, handle, usage); noop_resource = noop_resource_create(screen, result); pipe_resource_reference(&result, NULL); return noop_resource; @@ -129,7 +130,8 @@ static struct pipe_resource *noop_resource_from_handle(struct pipe_screen *scree static boolean noop_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { return FALSE; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index f58cf97646e..84ebfdb1cba 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -585,6 +585,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, return NULL; srcNr = 2; break; + case OP_SELP: srcNr = 3; break; default: // TODO when needed return NULL; @@ -601,7 +602,10 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, for (int s = 0; s < srcNr; ++s) { if (lo->getSrc(s)->reg.size < 8) { - hi->setSrc(s, zero); + if (s == 2) + hi->setSrc(s, lo->getSrc(s)); + else + hi->setSrc(s, zero); } else { if (lo->getSrc(s)->refCount() > 1) lo->setSrc(s, cloneShallow(fn, lo->getSrc(s))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 4504240ac5e..9f7d2572bbe 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -146,7 +146,6 @@ struct nv50_ir_prog_info bool earlyFragTests; bool separateFragData; bool usesDiscard; - bool sampleInterp; /* perform sample interp on all fp inputs */ } fp; struct { uint32_t inputOffset; /* base address for user args */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index b6b3ec7b948..0d7d95e3105 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1273,15 +1273,41 @@ CodeEmitterGK110::emitBAR(const Instruction *i) case NV50_IR_SUBOP_BAR_RED_OR: code[1] |= 0x90; break; case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break; default: - code[1] |= 0x20; assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC); break; } emitPredicate(i); - srcId(i->src(0), 10); - srcId(i->src(1), 23); + // barrier id + if (i->src(0).getFile() == FILE_GPR) { + srcId(i->src(0), 10); + } else { + ImmediateValue *imm = i->getSrc(0)->asImm(); + assert(imm); + code[0] |= imm->reg.data.u32 << 10; + code[1] |= 0x8000; + } + + // thread count + if (i->src(1).getFile() == FILE_GPR) { + srcId(i->src(1), 23); + } else { + 
ImmediateValue *imm = i->getSrc(1)->asImm(); + assert(imm); + assert(imm->reg.data.u32 <= 0xfff); + code[0] |= imm->reg.data.u32 << 23; + code[1] |= imm->reg.data.u32 >> 9; + code[1] |= 0x4000; + } + + if (i->srcExists(2) && (i->predSrc != 2)) { + srcId(i->src(2), 32 + 10); + if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 13; + } else { + code[1] |= 7 << 10; + } } void CodeEmitterGK110::emitMEMBAR(const Instruction *i) @@ -1386,7 +1412,7 @@ CodeEmitterGK110::emitVOTE(const Instruction *i) defId(i->def(0), 2); defId(i->def(1), 48); if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT)) - code[0] |= 1 << 45; + code[1] |= 1 << 13; srcId(i->src(0), 42); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index a383c53fcd3..e079a574cc8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -177,6 +177,7 @@ private: void emitAL2P(); void emitIPA(); void emitATOM(); + void emitATOMS(); void emitCCTL(); void emitPIXLD(); @@ -194,6 +195,7 @@ private: void emitKIL(); void emitOUT(); + void emitBAR(); void emitMEMBAR(); void emitVOTE(); @@ -2373,6 +2375,45 @@ CodeEmitterGM107::emitATOM() } void +CodeEmitterGM107::emitATOMS() +{ + unsigned dType, subOp; + + if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) { + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_U64: dType = 1; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + subOp = 4; + + emitInsn (0xee000000); + emitField(0x34, 1, dType); + } else { + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_S32: dType = 1; break; + case TYPE_U64: dType = 2; break; + case TYPE_S64: dType = 3; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + + if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) + subOp = 8; + else + subOp = insn->subOp; + + emitInsn (0xec000000); + emitField(0x1c, 3, dType); + } + + emitField(0x34, 4, subOp); + emitGPR (0x14, insn->src(1)); + emitADDR (0x08, 0x12, 22, 0, insn->src(0)); + emitGPR (0x00, insn->def(0)); +} + +void CodeEmitterGM107::emitCCTL() { unsigned width; @@ -2649,6 +2690,54 @@ CodeEmitterGM107::emitOUT() } void +CodeEmitterGM107::emitBAR() +{ + uint8_t subop; + + emitInsn (0xf0a80000); + + switch (insn->subOp) { + case NV50_IR_SUBOP_BAR_RED_POPC: subop = 0x02; break; + case NV50_IR_SUBOP_BAR_RED_AND: subop = 0x0a; break; + case NV50_IR_SUBOP_BAR_RED_OR: subop = 0x12; break; + case NV50_IR_SUBOP_BAR_ARRIVE: subop = 0x81; break; + default: + subop = 0x80; + assert(insn->subOp == NV50_IR_SUBOP_BAR_SYNC); + break; + } + + emitField(0x20, 8, subop); + + // barrier id + if (insn->src(0).getFile() == FILE_GPR) { + emitGPR(0x08, insn->src(0)); + } else { + ImmediateValue *imm = insn->getSrc(0)->asImm(); + assert(imm); + emitField(0x08, 8, imm->reg.data.u32); + emitField(0x2b, 1, 1); + } + + // thread count + if (insn->src(1).getFile() == FILE_GPR) { + emitGPR(0x14, insn->src(1)); + } else { + ImmediateValue *imm = insn->getSrc(1)->asImm(); + assert(imm); + emitField(0x14, 12, imm->reg.data.u32); + emitField(0x2c, 1, 1); + } + + if (insn->srcExists(2) && (insn->predSrc != 2)) { + emitPRED (0x27, insn->src(2)); + emitField(0x2a, 1, insn->src(2).mod == Modifier(NV50_IR_MOD_NOT)); + } else { + emitField(0x27, 3, 7); + } +} + +void CodeEmitterGM107::emitMEMBAR() { emitInsn (0xef980000); @@ -2918,7 +3007,10 @@ CodeEmitterGM107::emitInstruction(Instruction *i) } break; case OP_ATOM: - 
emitATOM(); + if (insn->src(0).getFile() == FILE_MEMORY_SHARED) + emitATOMS(); + else + emitATOM(); break; case OP_CCTL: emitCCTL(); @@ -2978,6 +3070,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) case OP_RESTART: emitOUT(); break; + case OP_BAR: + emitBAR(); + break; case OP_MEMBAR: emitMEMBAR(); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 7bd7c732c49..8b9328b6296 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1482,6 +1482,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) } else { ImmediateValue *imm = i->getSrc(1)->asImm(); assert(imm); + assert(imm->reg.data.u32 <= 0xfff); code[0] |= imm->reg.data.u32 << 26; code[1] |= imm->reg.data.u32 >> 6; code[1] |= 0x4000; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index d06e9efa463..d284446f5d9 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -525,6 +525,7 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_DRCP: case TGSI_OPCODE_DSQRT: case TGSI_OPCODE_DMAD: + case TGSI_OPCODE_DFMA: case TGSI_OPCODE_DFRAC: case TGSI_OPCODE_DRSQ: case TGSI_OPCODE_DTRUNC: @@ -615,6 +616,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(RCP, RCP); NV50_IR_OPCODE_CASE(RSQ, RSQ); + NV50_IR_OPCODE_CASE(SQRT, SQRT); NV50_IR_OPCODE_CASE(MUL, MUL); NV50_IR_OPCODE_CASE(ADD, ADD); @@ -624,6 +626,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(SLT, SET); NV50_IR_OPCODE_CASE(SGE, SET); NV50_IR_OPCODE_CASE(MAD, MAD); + NV50_IR_OPCODE_CASE(FMA, FMA); NV50_IR_OPCODE_CASE(SUB, SUB); NV50_IR_OPCODE_CASE(FLR, FLOOR); @@ -723,6 +726,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(DRCP, RCP); NV50_IR_OPCODE_CASE(DSQRT, SQRT); NV50_IR_OPCODE_CASE(DMAD, MAD); + NV50_IR_OPCODE_CASE(DFMA, FMA); NV50_IR_OPCODE_CASE(D2I, CVT); NV50_IR_OPCODE_CASE(D2U, CVT); NV50_IR_OPCODE_CASE(I2D, CVT); @@ -1182,10 +1186,6 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_SEMANTIC_VERTEXID: info->io.vertexId = first; break; - case TGSI_SEMANTIC_SAMPLEID: - case TGSI_SEMANTIC_SAMPLEPOS: - info->prop.fp.sampleInterp = 1; - break; case TGSI_SEMANTIC_BASEVERTEX: case TGSI_SEMANTIC_BASEINSTANCE: case TGSI_SEMANTIC_DRAWID: @@ -1564,7 +1564,7 @@ Converter::translateInterpMode(const struct nv50_ir_varying *var, operation& op) op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC) ? 
OP_PINTERP : OP_LINTERP; - if (var->centroid || info->prop.fp.sampleInterp) + if (var->centroid) mode |= NV50_IR_INTERP_CENTROID; return mode; @@ -2676,6 +2676,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_MAD: case TGSI_OPCODE_UMAD: case TGSI_OPCODE_SAD: + case TGSI_OPCODE_FMA: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = fetchSrc(0, c); src1 = fetchSrc(1, c); @@ -2689,6 +2690,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_FLR: case TGSI_OPCODE_TRUNC: case TGSI_OPCODE_RCP: + case TGSI_OPCODE_SQRT: case TGSI_OPCODE_IABS: case TGSI_OPCODE_INEG: case TGSI_OPCODE_NOT: @@ -3399,6 +3401,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; case TGSI_OPCODE_DMAD: + case TGSI_OPCODE_DFMA: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = getSSA(8); src1 = getSSA(8); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 8752b0c8c54..12c5f699603 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -1203,10 +1203,9 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, - bld.getSSA(), i->getSrc(0)); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index d181f1574f1..d0936d88d60 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1778,22 +1778,21 @@ NVC0LoweringPass::handleMOD(Instruction *i) bool NVC0LoweringPass::handleSQRT(Instruction *i) { - Value *pred = bld.getSSA(1, FILE_PREDICATE); - Value *zero = bld.getSSA(); - Instruction *rsq; - - bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0)); - if (i->dType == TYPE_F64) - zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero); - bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); - bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred); - rsq = bld.mkOp1(OP_RSQ, i->dType, - bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); - rsq->setPredicate(CC_NOT_P, pred); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); - i->setPredicate(CC_NOT_P, pred); - + if (i->dType == TYPE_F64) { + Value *pred = bld.getSSA(1, FILE_PREDICATE); + Value *zero = bld.loadImm(NULL, 0.0d); + Value *dst = bld.getSSA(8); + bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0)); + bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); + bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred); + i->op = OP_MUL; + i->setSrc(1, dst); + // TODO: Handle this properly with a library function + } else { + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); + } return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 6192c0665e4..66e7b2e8243 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1635,11 +1635,10 @@ AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, 
operation toOp) if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb) return false; - if (src->getInsn()->saturate) + if (src->getInsn()->saturate || src->getInsn()->postFactor || + src->getInsn()->dnz) return false; - if (src->getInsn()->postFactor) - return false; if (toOp == OP_SAD) { ImmediateValue imm; if (!src->getInsn()->src(2).getImmediate(imm)) diff --git a/src/gallium/drivers/nouveau/nouveau_debug.h b/src/gallium/drivers/nouveau/nouveau_debug.h index d17df81fed2..546a4ad0af3 100644 --- a/src/gallium/drivers/nouveau/nouveau_debug.h +++ b/src/gallium/drivers/nouveau/nouveau_debug.h @@ -16,7 +16,7 @@ #define NOUVEAU_DEBUG 0 #define NOUVEAU_ERR(fmt, args...) \ - fprintf(stderr, "%s:%d - "fmt, __FUNCTION__, __LINE__, ##args) + fprintf(stderr, "%s:%d - " fmt, __FUNCTION__, __LINE__, ##args) #define NOUVEAU_DBG(ch, args...) \ if ((NOUVEAU_DEBUG) & (NOUVEAU_DEBUG_##ch)) \ diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c index a98a6464de8..4d215d2e616 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c @@ -66,7 +66,8 @@ nv30_resource_create(struct pipe_screen *pscreen, static struct pipe_resource * nv30_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (tmpl->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 5be7a3dab76..b105c6aeb80 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -188,6 +188,10 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c index 5d415ae77eb..ad5f3b814db 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c @@ -22,7 +22,8 @@ nv50_resource_create(struct pipe_screen *screen, static struct pipe_resource * nv50_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (templ->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 0bd5de91d1f..5836bb23764 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -232,6 +232,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: @@ -301,7 +305,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 0; /* please inline, or provide function 
declarations */ case PIPE_SHADER_CAP_INTEGERS: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 6a09808807a..8504ba466cc 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -1245,7 +1245,7 @@ nv50_set_global_bindings(struct pipe_context *pipe, nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL); - nv50->dirty_cp = NV50_NEW_CP_GLOBALS; + nv50->dirty_cp |= NV50_NEW_CP_GLOBALS; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 4db73cb7fef..84646f6adb1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -1756,6 +1756,7 @@ nv50_blitter_destroy(struct nv50_screen *screen) } } + pipe_mutex_destroy(blitter->mutex); FREE(blitter); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 060f59d0c73..ffbb16f79de 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -262,35 +262,29 @@ nvc0_compute_validate_globals(struct nvc0_context *nvc0) } } +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nvc0_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, + { nvc0_compute_validate_driverconst, NVC0_NEW_CP_DRIVERCONST }, + { nvc0_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nvc0_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nvc0_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, +}; + static bool -nvc0_compute_state_validate(struct nvc0_context *nvc0) +nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - nvc0_compprog_validate(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_CONSTBUF) - nvc0_compute_validate_constbufs(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_DRIVERCONST) - nvc0_compute_validate_driverconst(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_BUFFERS) - nvc0_compute_validate_buffers(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nvc0_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nvc0_compute_validate_samplers(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) - nvc0_compute_validate_globals(nvc0); - - /* TODO: surfaces */ - - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; - if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + bool ret; - return true; + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + if (unlikely(nvc0->state.flushed)) + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + return ret; } static void @@ -326,7 +320,7 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) unsigned s; int ret; - ret = !nvc0_compute_state_validate(nvc0); + ret = !nvc0_state_validate_cp(nvc0, ~0); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 0f1ebb0a6e2..54afe887ebd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -262,7 +262,15 @@ void 
nvc0_tfb_validate(struct nvc0_context *); extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ -bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask); +struct nvc0_state_validate { + void (*func)(struct nvc0_context *); + uint32_t states; +}; + +bool nvc0_state_validate(struct nvc0_context *, uint32_t, + struct nvc0_state_validate *, int, uint32_t *, + struct nouveau_bufctx *); +bool nvc0_state_validate_3d(struct nvc0_context *, uint32_t); /* nvc0_surface.c */ extern void nvc0_clear(struct pipe_context *, unsigned buffers, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index d2acce7d5be..92ca613cda1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -204,10 +204,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += 2; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { count += 2; } } @@ -227,29 +224,16 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, * currently only used by AMD_performance_monitor. */ info->max_active_queries = 1; - - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->num_queries = NVE4_HW_SM_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->num_queries = NVC0_HW_SM_QUERY_COUNT; - return 1; - } + info->num_queries = nvc0_hw_sm_get_num_queries(screen); + return 1; } } else if (id == NVC0_HW_METRIC_QUERY_GROUP) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = "Performance metrics"; - info->max_active_queries = 1; - info->num_queries = NVE4_HW_METRIC_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { info->name = "Performance metrics"; info->max_active_queries = 1; - info->num_queries = NVC0_HW_METRIC_QUERY_COUNT; + info->num_queries = nvc0_hw_metric_get_num_queries(screen); return 1; } } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index 7a64b69b1c1..b961cbf652e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -24,32 +24,51 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */ -static const char *nvc0_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", +#define _Q(t,n) { NVC0_HW_METRIC_QUERY_##t, n } +struct { + unsigned type; + const char *name; +} nvc0_hw_metric_queries[] = { + _Q(ACHIEVED_OCCUPANCY, "metric-achieved_occupancy" ), + _Q(BRANCH_EFFICIENCY, "metric-branch_efficiency" ), + _Q(INST_ISSUED, "metric-inst_issued" ), + _Q(INST_PER_WRAP, "metric-inst_per_wrap" ), + _Q(INST_REPLAY_OVERHEAD, "metric-inst_replay_overhead" ), + _Q(ISSUED_IPC, "metric-issued_ipc" ), + _Q(ISSUE_SLOTS, "metric-issue_slots" ), + _Q(ISSUE_SLOT_UTILIZATION, "metric-issue_slot_utilization" ), + _Q(IPC, "metric-ipc" ), + _Q(SHARED_REPLAY_OVERHEAD, "metric-shared_replay_overhead" ), }; 
+#undef _Q + +static inline const char * +nvc0_hw_metric_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) { + if (nvc0_hw_metric_queries[i].type == query_type) + return nvc0_hw_metric_queries[i].name; + } + assert(0); + return NULL; +} + struct nvc0_hw_metric_query_cfg { + unsigned type; uint32_t queries[8]; uint32_t num_queries; }; #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm20_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -58,6 +77,7 @@ sm20_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm20_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -66,6 +86,7 @@ sm20_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm20_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -74,6 +95,7 @@ sm20_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm20_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(INST_EXECUTED), .num_queries = 2, @@ -82,6 +104,16 @@ sm20_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm20_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -90,6 +122,7 @@ sm20_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm20_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -97,21 +130,20 @@ sm20_ipc = static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, NULL), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead), - _M(ISSUED_IPC, &sm20_issued_ipc), - _M(ISSUE_SLOTS, NULL), - _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc), - _M(IPC, &sm20_ipc), + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm20_inst_per_wrap, + &sm20_inst_replay_overhead, + &sm20_issued_ipc, + &sm20_issue_slot_utilization, + &sm20_ipc, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm21_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -122,6 +154,7 @@ sm21_inst_issued = static const struct nvc0_hw_metric_query_cfg sm21_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -133,6 +166,7 @@ sm21_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm21_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, .queries[0] = 
_SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -141,44 +175,36 @@ sm21_issued_ipc = .num_queries = 5, }; -static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +static const struct nvc0_hw_metric_query_cfg +sm21_issue_slot_utilization = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, &sm21_inst_issued), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead), - _M(ISSUED_IPC, &sm21_issued_ipc), - _M(ISSUE_SLOTS, &sm21_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc), - _M(IPC, &sm20_ipc), + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(ACTIVE_CYCLES), + .num_queries = 5, }; -#undef _SM -#undef _M - -/* === PERFORMANCE MONITORING METRICS for NVE4+ === */ -static const char *nve4_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", - "metric-shared_replay_overhead", +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +{ + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm21_inst_issued, + &sm20_inst_per_wrap, + &sm21_inst_replay_overhead, + &sm21_issued_ipc, + &sm21_inst_issued, + &sm21_issue_slot_utilization, + &sm20_ipc, }; -#define _SM(n) NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVE4_HW_METRIC_QUERY_##n] = c - /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */ static const struct nvc0_hw_metric_query_cfg sm30_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -187,6 +213,7 @@ sm30_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm30_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -195,6 +222,7 @@ sm30_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm30_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .num_queries = 2, @@ -203,6 +231,7 @@ sm30_inst_issued = static const struct nvc0_hw_metric_query_cfg sm30_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -211,6 +240,7 @@ sm30_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm30_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(INST_EXECUTED), @@ -220,6 +250,17 @@ sm30_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm30_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED1), + .queries[1] = _SM(INST_ISSUED2), + .queries[2] = _SM(ACTIVE_CYCLES), + .num_queries = 3, +}; + +static const struct nvc0_hw_metric_query_cfg +sm30_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(ACTIVE_CYCLES), @@ 
-229,6 +270,7 @@ sm30_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm30_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -237,6 +279,7 @@ sm30_ipc = static const struct nvc0_hw_metric_query_cfg sm30_shared_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, .queries[0] = _SM(SHARED_LD_REPLAY), .queries[1] = _SM(SHARED_ST_REPLAY), .queries[2] = _SM(INST_EXECUTED), @@ -245,44 +288,89 @@ sm30_shared_replay_overhead = static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm30_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm30_branch_efficiency), - _M(INST_ISSUED, &sm30_inst_issued), - _M(INST_PER_WRAP, &sm30_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm30_inst_replay_overhead), - _M(ISSUED_IPC, &sm30_issued_ipc), - _M(ISSUE_SLOTS, &sm30_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm30_issued_ipc), - _M(IPC, &sm30_ipc), - _M(SHARED_REPLAY_OVERHEAD, &sm30_shared_replay_overhead), + &sm30_achieved_occupancy, + &sm30_branch_efficiency, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_inst_issued, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, +}; + +/* ==== Compute capability 3.5 (GK110) ==== */ +static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] = +{ + &sm30_achieved_occupancy, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_inst_issued, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, }; #undef _SM -#undef _M static inline const struct nvc0_hw_metric_query_cfg ** nvc0_hw_metric_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_metric_queries; - return sm21_hw_metric_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_metric_queries; + case NVE4_3D_CLASS: + return sm30_hw_metric_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_metric_queries; + return sm21_hw_metric_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_metric_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_metric_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_metric_queries); + return ARRAY_SIZE(sm21_hw_metric_queries); + } + return 0; } static const struct nvc0_hw_metric_query_cfg * -nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, - struct nvc0_hw_query *hq) +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { const struct nvc0_hw_metric_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return sm30_hw_metric_queries[q->type - NVE4_HW_METRIC_QUERY(0)]; - + num_queries = nvc0_hw_metric_get_num_queries(screen); queries = nvc0_hw_metric_get_queries(screen); - return queries[q->type - NVC0_HW_METRIC_QUERY(0)]; + + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type) + return queries[i]; + } + assert(0); + return NULL; } static void @@ 
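Because every config now records its own .type, the per-chip tables can be dense and ordered however is convenient; selection happens in two steps: pick a table by 3D class, then search it by type. A compilable sketch of that shape (classes, types and tables are invented, and the SM20 path is omitted):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    enum chip_class { CHIP_SM20, CHIP_SM30, CHIP_SM35 };

    struct query_cfg { unsigned type; const char *name; };

    static const struct query_cfg cfg_ipc       = { 1, "ipc" };
    static const struct query_cfg cfg_occupancy = { 2, "occupancy" };

    static const struct query_cfg *sm30_table[] = { &cfg_occupancy, &cfg_ipc };
    static const struct query_cfg *sm35_table[] = { &cfg_ipc };

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const struct query_cfg **
    get_queries(enum chip_class cls, size_t *count)
    {
       switch (cls) {
       case CHIP_SM35: *count = ARRAY_SIZE(sm35_table); return sm35_table;
       case CHIP_SM30: *count = ARRAY_SIZE(sm30_table); return sm30_table;
       default:        *count = 0;                      return NULL;
       }
    }

    /* linear search by embedded type, replacing direct array indexing */
    static const struct query_cfg *
    get_cfg(enum chip_class cls, unsigned type)
    {
       size_t n, i;
       const struct query_cfg **table = get_queries(cls, &n);

       for (i = 0; i < n; i++)
          if (table[i]->type == type)
             return table[i];
       assert(!"invalid query type");
       return NULL;
    }

    int main(void)
    {
       printf("%s\n", get_cfg(CHIP_SM30, 1)->name);   /* prints: ipc */
       return 0;
    }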
-419,47 +507,47 @@ sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) static uint64_t sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { - switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) { - case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: /* (active_warps / active_cycles) / max. number of warps on a MP */ if (res64[1]) return (res64[0] / (double)res64[1]) / 64; break; - case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_ISSUED: + case NVC0_HW_METRIC_QUERY_INST_ISSUED: /* inst_issued1 + inst_issued2 * 2 */ return res64[0] + res64[1] * 2; - case NVE4_HW_METRIC_QUERY_INST_PER_WRAP: + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: /* (metric-inst_issued - inst_executed) / inst_executed */ if (res64[2]) return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]); break; - case NVE4_HW_METRIC_QUERY_ISSUED_IPC: + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: /* metric-inst_issued / active_cycles */ if (res64[2]) return (res64[0] + res64[1] * 2) / (double)res64[2]; break; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOTS: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS: /* inst_issued1 + inst_issued2 */ return res64[0] + res64[1]; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: /* ((metric-issue_slots / 2) / active_cycles) * 100 */ if (res64[2]) return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100; break; - case NVE4_HW_METRIC_QUERY_IPC: + case NVC0_HW_METRIC_QUERY_IPC: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: /* (shared_load_replay + shared_store_replay) / inst_executed */ if (res64[2]) return (res64[0] + res64[1]) / (double)res64[2]; break; default: debug_printf("invalid metric type: %d\n", - hq->base.type - NVE4_HW_METRIC_QUERY(0)); + hq->base.type - NVC0_HW_METRIC_QUERY(0)); break; } return 0; @@ -487,13 +575,17 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, res64[i] = *(uint64_t *)&results[i]; } - if (screen->base.class_3d >= NVE4_3D_CLASS) { + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + case NVE4_3D_CLASS: value = sm30_hw_metric_calc_result(hq, res64); - } else { + break; + default: if (dev->chipset == 0xc0 || dev->chipset == 0xc8) value = sm20_hw_metric_calc_result(hq, res64); else value = sm21_hw_metric_calc_result(hq, res64); + break; } *(uint64_t *)result = value; @@ -515,8 +607,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) struct nvc0_hw_query *hq; unsigned i; - if ((type < NVE4_HW_METRIC_QUERY(0) || type > NVE4_HW_METRIC_QUERY_LAST) && - (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)) + if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST) return NULL; hmq = CALLOC_STRUCT(nvc0_hw_metric_query); @@ -541,46 +632,15 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && 
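For concreteness, here is the arithmetic the SM30 branch of the switch above performs, with made-up counter values. res64[] follows the config's .queries[] order, here { inst_issued1, inst_issued2, active_cycles }:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       uint64_t res64[3] = { 1000, 250, 2000 };

       /* metric-inst_issued: each dual-issue slot counts two instructions */
       uint64_t inst_issued = res64[0] + res64[1] * 2;        /* 1500 */

       /* metric-issued_ipc: instructions issued per active cycle */
       double issued_ipc = inst_issued / (double)res64[2];    /* 0.75 */

       /* metric-issue_slot_utilization: slots used per cycle, as a percent */
       double slots = res64[0] + res64[1];                    /* 1250 */
       double util = ((slots / 2) / (double)res64[2]) * 100;  /* 31.25% */

       printf("%llu %.2f %.2f%%\n",
              (unsigned long long)inst_issued, issued_ipc, util);
       return 0;   /* prints: 1500 0.75 31.25% */
    }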
queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) { - uint16_t class_3d = screen->base.class_3d; int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_METRIC_QUERY_COUNT; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = - nvc0_hw_metric_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_metric_get_num_queries(screen); } if (!info) @@ -588,19 +648,12 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_metric_names[id]; - info->query_type = NVE4_HW_METRIC_QUERY(id); - info->group_id = NVC0_HW_METRIC_QUERY_GROUP; - return 1; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = + if (screen->base.class_3d <= NVF0_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = nvc0_hw_metric_get_queries(screen); - id = nvc0_hw_metric_get_next_query_id(queries, id); - info->name = nvc0_hw_metric_names[id]; - info->query_type = NVC0_HW_METRIC_QUERY(id); + info->name = nvc0_hw_metric_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type); info->group_id = NVC0_HW_METRIC_QUERY_GROUP; return 1; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h index 06cb355db40..3203a8ca2b9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h @@ -18,24 +18,7 @@ nvc0_hw_metric_query(struct nvc0_hw_query *hq) /* * Driver metrics queries: */ -#define NVE4_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) -#define NVE4_HW_METRIC_QUERY_LAST NVE4_HW_METRIC_QUERY(NVE4_HW_METRIC_QUERY_COUNT - 1) -enum nve4_hw_metric_queries -{ - NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0, - NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY, - NVE4_HW_METRIC_QUERY_INST_ISSUED, - NVE4_HW_METRIC_QUERY_INST_PER_WRAP, - NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_ISSUED_IPC, - NVE4_HW_METRIC_QUERY_ISSUE_SLOTS, - NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, - NVE4_HW_METRIC_QUERY_IPC, - NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_COUNT -}; - -#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) +#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) #define NVC0_HW_METRIC_QUERY_LAST NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1) enum nvc0_hw_metric_queries { @@ -48,6 +31,7 @@ enum nvc0_hw_metric_queries NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, NVC0_HW_METRIC_QUERY_IPC, + NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, NVC0_HW_METRIC_QUERY_COUNT }; @@ -56,4 +40,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *, unsigned); int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *); + #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f5f9bb39fd9..db36b8a1b9f 
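The header change above drops the separate NVE4_HW_METRIC_QUERY numbering and rebases the unified metric IDs at PIPE_QUERY_DRIVER_SPECIFIC + 2048, which is why a single range check now suffices in create_query. A sketch of the numbering scheme; note that PIPE_QUERY_DRIVER_SPECIFIC is stubbed with an arbitrary value here:

    #include <stdio.h>

    #define PIPE_QUERY_DRIVER_SPECIFIC 256   /* stand-in value */
    #define HW_SM_QUERY(i)     (PIPE_QUERY_DRIVER_SPECIFIC + (i))
    #define HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))

    /* SM counters and metrics occupy disjoint ranges, so one integer
     * identifies both the query family and the index within it. */
    static int is_metric_query(unsigned type, unsigned metric_count)
    {
       return type >= HW_METRIC_QUERY(0) &&
              type <= HW_METRIC_QUERY(metric_count - 1);
    }

    int main(void)
    {
       printf("%d\n", is_metric_query(HW_METRIC_QUERY(3), 10)); /* 1 */
       printf("%d\n", is_metric_query(HW_SM_QUERY(3), 10));     /* 0 */
       return 0;
    }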
100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -30,59 +30,87 @@ #include "nvc0/nve4_compute.xml.h" #include "nvc0/nvc0_compute.xml.h" -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - /* NOTE: intentionally using the same names as NV */ -static const char *nve4_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_cas_count", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "__l1_global_load_transactions", - "__l1_global_store_transactions", - "l1_local_load_hit", - "l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", +#define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n } +struct { + unsigned type; + const char *name; +} nvc0_hw_sm_queries[] = { + _Q(ACTIVE_CYCLES, "active_cycles" ), + _Q(ACTIVE_WARPS, "active_warps" ), + _Q(ATOM_CAS_COUNT, "atom_cas_count" ), + _Q(ATOM_COUNT, "atom_count" ), + _Q(BRANCH, "branch" ), + _Q(DIVERGENT_BRANCH, "divergent_branch" ), + _Q(GLD_REQUEST, "gld_request" ), + _Q(GLD_MEM_DIV_REPLAY, "global_ld_mem_divergence_replays" ), + _Q(GST_TRANSACTIONS, "global_store_transaction" ), + _Q(GST_MEM_DIV_REPLAY, "global_st_mem_divergence_replays" ), + _Q(GRED_COUNT, "gred_count" ), + _Q(GST_REQUEST, "gst_request" ), + _Q(INST_EXECUTED, "inst_executed" ), + _Q(INST_ISSUED, "inst_issued" ), + _Q(INST_ISSUED1, "inst_issued1" ), + _Q(INST_ISSUED2, "inst_issued2" ), + _Q(INST_ISSUED1_0, "inst_issued1_0" ), + _Q(INST_ISSUED1_1, "inst_issued1_1" ), + _Q(INST_ISSUED2_0, "inst_issued2_0" ), + _Q(INST_ISSUED2_1, "inst_issued2_1" ), + _Q(L1_GLD_HIT, "l1_global_load_hit" ), + _Q(L1_GLD_MISS, "l1_global_load_miss" ), + _Q(L1_GLD_TRANSACTIONS, "__l1_global_load_transactions" ), + _Q(L1_GST_TRANSACTIONS, "__l1_global_store_transactions" ), + _Q(L1_LOCAL_LD_HIT, "l1_local_load_hit" ), + _Q(L1_LOCAL_LD_MISS, "l1_local_load_miss" ), + _Q(L1_LOCAL_ST_HIT, "l1_local_store_hit" ), + _Q(L1_LOCAL_ST_MISS, "l1_local_store_miss" ), + _Q(L1_SHARED_LD_TRANSACTIONS, "l1_shared_load_transactions" ), + _Q(L1_SHARED_ST_TRANSACTIONS, "l1_shared_store_transactions" ), + _Q(LOCAL_LD, "local_load" ), + _Q(LOCAL_LD_TRANSACTIONS, "local_load_transactions" ), + _Q(LOCAL_ST, "local_store" ), + _Q(LOCAL_ST_TRANSACTIONS, "local_store_transactions" ), + _Q(NOT_PRED_OFF_INST_EXECUTED, "not_predicated_off_thread_inst_executed" ), + _Q(PROF_TRIGGER_0, "prof_trigger_00" ), + _Q(PROF_TRIGGER_1, "prof_trigger_01" ), + _Q(PROF_TRIGGER_2, "prof_trigger_02" ), + _Q(PROF_TRIGGER_3, "prof_trigger_03" ), + _Q(PROF_TRIGGER_4, "prof_trigger_04" ), + _Q(PROF_TRIGGER_5, "prof_trigger_05" ), + _Q(PROF_TRIGGER_6, "prof_trigger_06" ), + _Q(PROF_TRIGGER_7, "prof_trigger_07" ), + _Q(SHARED_LD, "shared_load" ), + _Q(SHARED_LD_REPLAY, 
"shared_load_replay" ), + _Q(SHARED_ST, "shared_store" ), + _Q(SHARED_ST_REPLAY, "shared_store_replay" ), + _Q(SM_CTA_LAUNCHED, "sm_cta_launched" ), + _Q(THREADS_LAUNCHED, "threads_launched" ), + _Q(TH_INST_EXECUTED, "thread_inst_executed" ), + _Q(TH_INST_EXECUTED_0, "thread_inst_executed_0" ), + _Q(TH_INST_EXECUTED_1, "thread_inst_executed_1" ), + _Q(TH_INST_EXECUTED_2, "thread_inst_executed_2" ), + _Q(TH_INST_EXECUTED_3, "thread_inst_executed_3" ), + _Q(UNCACHED_GLD_TRANSACTIONS, "uncached_global_load_transaction" ), + _Q(WARPS_LAUNCHED, "warps_launched" ), }; +#undef _Q + +static inline const char * +nvc0_hw_sm_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) { + if (nvc0_hw_sm_queries[i].type == query_type) + return nvc0_hw_sm_queries[i].name; + } + assert(0); + return NULL; +} + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + /* Code to read out MP counters: They are accessible via mmio, too, but let's * just avoid mapping registers in userspace. We'd have to know which MPs are * enabled/present, too, and that information is not presently exposed. @@ -169,6 +197,49 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x8000000000001de7ULL }; +static const uint64_t nvf0_read_hw_sm_counters_code[] = +{ + /* Same kernel as GK104 */ + 0x0880808080808080ULL, + 0x86400000109c0022ULL, + 0x86400000019c0032ULL, + 0x86400000021c0002ULL, + 0x86400000029c0006ULL, + 0x86400000031c000aULL, + 0x86400000039c000eULL, + 0x86400000041c0012ULL, + 0x08ac1080108c8080ULL, + 0x86400000049c0016ULL, + 0x86400000051c001aULL, + 0x86400000059c001eULL, + 0xdb201c007f9c201eULL, + 0x64c03c00001c002aULL, + 0xc00000020a1c3021ULL, + 0x64c03c00009c002eULL, + 0x0810a0808010b810ULL, + 0xc0000001041c3025ULL, + 0x180000000020003cULL, + 0xdb201c007f9c243eULL, + 0xc1c00000301c2021ULL, + 0xc1c00000081c2431ULL, + 0xc1c00000021c2435ULL, + 0xe0800000069c2026ULL, + 0x08b010b010b010a0ULL, + 0xe0800000061c2022ULL, + 0xe4c03c00051c0032ULL, + 0xe0840000041c282aULL, + 0xe4c03c00059c0036ULL, + 0xe08040007f9c2c2eULL, + 0xe0840000049c3032ULL, + 0xfe800000001c2800ULL, + 0x080000b81080b010ULL, + 0x64c03c00011c0002ULL, + 0xe08040007f9c3436ULL, + 0xfe80000020043010ULL, + 0xfc800000281c3000ULL, + 0x18000000001c003cULL, +}; + /* For simplicity, we will allocate as many group slots as we allocate counter * slots. This means that a single counter which wants to source from 2 groups * will have to be declared as using 2 counter slots. 
This shouldn't really be @@ -187,69 +258,593 @@ struct nvc0_hw_sm_counter_cfg struct nvc0_hw_sm_query_cfg { + unsigned type; struct nvc0_hw_sm_counter_cfg ctr[8]; uint8_t num_counters; uint8_t norm[2]; /* normalization num,denom */ }; -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } +#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s } +#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s } +#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c + +/* ==== Compute capability 3.0 (GK104:GK110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm30_active_cycles = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, + .ctr[0] = _CB(0x0001, B6, WARP, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_active_warps = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, + .ctr[0] = _CB(0x003f, B6, WARP, 0x31483104), + .num_counters = 1, + .norm = { 2, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x000000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_branch = +{ + .type = NVC0_HW_SM_QUERY_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_divergent_branch = +{ + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_request = +{ + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_request = +{ + .type = NVC0_HW_SM_QUERY_GST_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, + .ctr[0] = _CA(0x0003, B6, EXEC, 0x00000398), + .num_counters = 1, + .norm = { 1, 1 }, 
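The _CA()/_CB() helpers introduced above build one hardware-counter config each: a function mask, a counting mode (token-pasted onto the register-name prefix), a flag for the A or B signal domain, a signal-group select and a source select. The sketch below mirrors the shape only; the struct field names are simplified and every numeric value is a placeholder, not a real NVE4 register encoding:

    #include <stdint.h>
    #include <stdio.h>

    struct sm_counter_cfg {
       uint32_t func;      /* logic function applied to the input signals */
       uint32_t mode;      /* counting mode, e.g. B6 vs. LOGOP */
       uint32_t sig_dom;   /* 0 = A-domain signals, 1 = B-domain signals */
       uint32_t sig_sel;   /* signal group select */
       uint32_t src_mask;  /* unused in these configs, kept for layout */
       uint32_t src_sel;   /* source select within the group */
    };

    #define MODE_B6 6      /* placeholder, not the hardware encoding */

    #define _CA(f, m, g, s) { (f), MODE_##m, 0, (g), 0, (s) }
    #define _CB(f, m, g, s) { (f), MODE_##m, 1, (g), 0, (s) }

    /* e.g. sm30_warps_launched above is one A-domain LAUNCH-group counter */
    static const struct sm_counter_cfg warps_launched =
       _CA(0x0001, B6, /* LAUNCH */ 0x0, 0x00000004);

    int main(void)
    {
       printf("dom=%u sel=0x%08x\n",
              (unsigned)warps_launched.sig_dom,
              (unsigned)warps_launched.src_sel);
       return 0;
    }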
+}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued1 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED1, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued2 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED2, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_0 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000000), + .num_counters = 1, + .norm = { 
1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_1 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_2 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_3 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_4 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_5 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_6 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_7 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_sm_cta_launched = +{ + .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, + .ctr[0] = _CB(0x0001, B6, WARP, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_threads_launched = +{ + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + .ctr[0] = _CA(0x003f, B6, LAUNCH, 0x398a4188), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_uncached_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_warps_launched = +{ + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + .ctr[0] = _CA(0x0001, B6, LAUNCH, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; /* NOTES: * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps * inst_executed etc.: we only count a single warp scheduler */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_CAS_COUNT, 0x0001, B6, BRANCH, 0x000000004, 1, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), 
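Each config also carries .norm = { numerator, denominator }, which per the struct comment is a normalization applied to the raw counter result; active_warps on SM30, for instance, uses { 2, 1 }. How the scaling is applied at readout is sketched below as an assumption, not Mesa's exact readback code:

    #include <stdint.h>
    #include <stdio.h>

    struct query_cfg { uint8_t norm[2]; };

    /* assumed application: scale the summed counter by num/denom */
    static uint64_t
    apply_norm(const struct query_cfg *cfg, uint64_t raw)
    {
       return raw * cfg->norm[0] / cfg->norm[1];
    }

    int main(void)
    {
       struct query_cfg active_warps = { .norm = { 2, 1 } };
       printf("%llu\n",
              (unsigned long long)apply_norm(&active_warps, 500)); /* 1000 */
       return 0;
    }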
- _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_GLD_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000000, 1, 1), - _Q1B(L1_GST_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), -}; - -#undef _Q1A -#undef _Q1B +static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm30_atom_cas_count, + &sm30_atom_count, + &sm30_branch, + &sm30_divergent_branch, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm30_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + 
&sm30_shared_ld_replay, + &sm30_shared_st, + &sm30_shared_st_replay, + &sm30_sm_cta_launched, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +/* ==== Compute capability 3.5 (GK110/GK208) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm35_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_not_pred_off_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK14, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x00000151), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x000001d1), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_th_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK11, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm35_atom_cas_count, + &sm35_atom_count, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm35_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm35_not_pred_off_inst_executed, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + &sm35_shared_ld_replay, + &sm30_shared_st, + &sm35_shared_st_replay, + &sm30_sm_cta_launched, + &sm35_th_inst_executed, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +#undef _Q +#undef _CA +#undef _CB /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ /* NOTES: @@ -257,43 +852,6 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = * because there is a context-switch problem that we need to fix. * Results might be wrong sometimes, be careful! 
*/ -static const char *nvc0_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - static const uint64_t nvc0_read_hw_sm_counters_code[] = { /* mov b32 $r8 $tidx @@ -345,12 +903,12 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = }; #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } -#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm20_active_cycles = { + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -359,6 +917,7 @@ sm20_active_cycles = static const struct nvc0_hw_sm_query_cfg sm20_active_warps = { + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), @@ -372,6 +931,7 @@ sm20_active_warps = static const struct nvc0_hw_sm_query_cfg sm20_atom_count = { + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -380,6 +940,7 @@ sm20_atom_count = static const struct nvc0_hw_sm_query_cfg sm20_branch = { + .type = NVC0_HW_SM_QUERY_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), .num_counters = 2, @@ -389,6 +950,7 @@ sm20_branch = static const struct nvc0_hw_sm_query_cfg sm20_divergent_branch = { + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), .num_counters = 2, @@ -398,6 +960,7 @@ sm20_divergent_branch = static const struct nvc0_hw_sm_query_cfg sm20_gld_request = { + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -406,6 +969,7 @@ sm20_gld_request = static const struct nvc0_hw_sm_query_cfg sm20_gred_count = { + .type = NVC0_HW_SM_QUERY_GRED_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -414,6 +978,7 @@ sm20_gred_count = static const struct nvc0_hw_sm_query_cfg sm20_gst_request = { + .type = NVC0_HW_SM_QUERY_GST_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -422,6 +987,7 @@ sm20_gst_request = static const struct nvc0_hw_sm_query_cfg sm20_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), .num_counters = 2, @@ -431,6 +997,7 @@ sm20_inst_executed = static const struct nvc0_hw_sm_query_cfg sm20_inst_issued = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED, .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 
0x00007060), .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), .num_counters = 2, @@ -440,6 +1007,7 @@ sm20_inst_issued = static const struct nvc0_hw_sm_query_cfg sm20_local_ld = { + .type = NVC0_HW_SM_QUERY_LOCAL_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -448,6 +1016,7 @@ sm20_local_ld = static const struct nvc0_hw_sm_query_cfg sm20_local_st = { + .type = NVC0_HW_SM_QUERY_LOCAL_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -456,6 +1025,7 @@ sm20_local_st = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_0 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -464,6 +1034,7 @@ sm20_prof_trigger_0 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_1 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -472,6 +1043,7 @@ sm20_prof_trigger_1 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_2 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -480,6 +1052,7 @@ sm20_prof_trigger_2 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_3 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -488,6 +1061,7 @@ sm20_prof_trigger_3 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_4 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -496,6 +1070,7 @@ sm20_prof_trigger_4 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_5 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -504,6 +1079,7 @@ sm20_prof_trigger_5 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_6 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -512,6 +1088,7 @@ sm20_prof_trigger_6 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_7 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), .num_counters = 1, .norm = { 1, 1 }, @@ -520,6 +1097,7 @@ sm20_prof_trigger_7 = static const struct nvc0_hw_sm_query_cfg sm20_shared_ld = { + .type = NVC0_HW_SM_QUERY_SHARED_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -528,6 +1106,7 @@ sm20_shared_ld = static const struct nvc0_hw_sm_query_cfg sm20_shared_st = { + .type = NVC0_HW_SM_QUERY_SHARED_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -536,6 +1115,7 @@ sm20_shared_st = static const struct nvc0_hw_sm_query_cfg sm20_threads_launched = { + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), @@ -549,6 +1129,7 @@ sm20_threads_launched = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), .ctr[1] = 
_C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), @@ -562,6 +1143,7 @@ sm20_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), @@ -575,6 +1157,7 @@ sm20_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm20_warps_launched = { + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -582,44 +1165,39 @@ sm20_warps_launched = static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm20_inst_executed), - _Q(INST_ISSUED, &sm20_inst_issued), - _Q(INST_ISSUED1_0, NULL), - _Q(INST_ISSUED1_1, NULL), - _Q(INST_ISSUED2_0, NULL), - _Q(INST_ISSUED2_1, NULL), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, NULL), - _Q(TH_INST_EXECUTED_3, NULL), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm20_inst_executed, + &sm20_inst_issued, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm20_th_inst_executed_0, + &sm20_th_inst_executed_1, + &sm20_warps_launched, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm21_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), @@ -630,6 +1208,7 @@ sm21_inst_executed = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -638,6 +1217,7 @@ sm21_inst_issued1_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -646,6 +1226,7 @@ sm21_inst_issued1_1 = static const struct 
nvc0_hw_sm_query_cfg sm21_inst_issued2_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -654,6 +1235,7 @@ sm21_inst_issued2_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -662,6 +1244,7 @@ sm21_inst_issued2_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), @@ -675,6 +1258,7 @@ sm21_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), @@ -688,6 +1272,7 @@ sm21_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_2 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), @@ -701,6 +1286,7 @@ sm21_th_inst_executed_2 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_3 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), @@ -713,41 +1299,39 @@ sm21_th_inst_executed_3 = static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm21_inst_executed), - _Q(INST_ISSUED, NULL), - _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), - _Q(INST_ISSUED1_1, &sm21_inst_issued1_1), - _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), - _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), - _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm21_inst_executed, + &sm21_inst_issued1_0, + &sm21_inst_issued1_1, + &sm21_inst_issued2_0, + &sm21_inst_issued2_1, + &sm20_local_ld, + 
&sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm21_th_inst_executed_0, + &sm21_th_inst_executed_1, + &sm21_th_inst_executed_2, + &sm21_th_inst_executed_3, + &sm20_warps_launched, }; -#undef _Q #undef _C static inline const struct nvc0_hw_sm_query_cfg ** @@ -755,26 +1339,55 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_sm_queries; - return sm21_hw_sm_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_sm_queries; + case NVE4_3D_CLASS: + return sm30_hw_sm_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_sm_queries; + return sm21_hw_sm_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_sm_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_sm_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_sm_queries); + return ARRAY_SIZE(sm21_hw_sm_queries); + } + return 0; } static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { + const struct nvc0_hw_sm_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + num_queries = nvc0_hw_sm_get_num_queries(screen); + queries = nvc0_hw_sm_get_queries(screen); - if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - return queries[q->type - NVC0_HW_SM_QUERY(0)]; + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type) + return queries[i]; } - debug_printf("invalid query type: %d\n", q->type); + assert(0); return NULL; } @@ -929,6 +1542,37 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) return true; } +static inline struct nvc0_program * +nvc0_hw_sm_get_program(struct nvc0_screen *screen) +{ + struct nvc0_program *prog; + + prog = CALLOC_STRUCT(nvc0_program); + if (!prog) + return NULL; + + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->parm_size = 12; + + if (screen->base.class_3d == NVE4_3D_CLASS || + screen->base.class_3d == NVF0_3D_CLASS) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvf0_read_hw_sm_counters_code); + } + prog->num_gprs = 14; + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + prog->num_gprs = 12; + } + return prog; +} + static void nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { @@ -944,22 +1588,8 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) const uint grid[3] = { 
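nvc0_hw_sm_get_program() above factors the one-time setup of the counter-readout compute program out of end_query: pick the precompiled blob for the chip class (GK110 gets its own encoding of the same kernel), set the GPR count, and cache the result on the screen, now with the CALLOC failure handled. A stand-alone sketch of the lazy-init shape, with placeholder types and blobs:

    #include <stdint.h>
    #include <stdlib.h>

    enum chip_class { CHIP_SM20, CHIP_SM30, CHIP_SM35 };

    struct program {
       const uint32_t *code;
       size_t          code_size;
       unsigned        num_gprs;
    };

    static const uint32_t sm20_blob[] = { 0 };  /* placeholder payloads */
    static const uint32_t sm30_blob[] = { 0 };

    struct screen {
       enum chip_class cls;
       struct program *pm_prog;   /* cached readout program */
    };

    static struct program *
    get_program(const struct screen *s)
    {
       struct program *prog = calloc(1, sizeof(*prog));
       if (!prog)
          return NULL;

       if (s->cls >= CHIP_SM30) {          /* newer chips: bigger kernel */
          prog->code = sm30_blob;
          prog->code_size = sizeof(sm30_blob);
          prog->num_gprs = 14;
       } else {
          prog->code = sm20_blob;
          prog->code_size = sizeof(sm20_blob);
          prog->num_gprs = 12;
       }
       return prog;
    }

    static void
    end_query(struct screen *s)
    {
       if (!s->pm_prog)                    /* create on first use only */
          s->pm_prog = get_program(s);
       /* ... launch s->pm_prog to read the counters ... */
    }

    int main(void)
    {
       struct screen s = { .cls = CHIP_SM35, .pm_prog = NULL };
       end_query(&s);
       end_query(&s);   /* second call reuses the cached program */
       free(s.pm_prog);
       return 0;
    }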
screen->mp_count, screen->gpc_count, 1 }; unsigned c, i; - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - prog->num_gprs = 14; - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - prog->num_gprs = 12; - } - screen->pm.prog = prog; - } + if (unlikely(!screen->pm.prog)) + screen->pm.prog = nvc0_hw_sm_get_program(screen); /* disable all counting */ PUSH_SPACE(push, 8); @@ -1132,8 +1762,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) if (nvc0->screen->base.drm->version < 0x01000101) return NULL; - if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && - (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)) + if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST) return NULL; hsq = CALLOC_STRUCT(nvc0_hw_sm_query); @@ -1201,23 +1830,6 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) @@ -1225,21 +1837,8 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_sm_get_num_queries(screen); } if (!info) @@ -1247,19 +1846,12 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_sm_query_names[id]; - info->query_type = NVE4_HW_SM_QUERY(id); - info->group_id = NVC0_HW_SM_QUERY_GROUP; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { const struct nvc0_hw_sm_query_cfg **queries = nvc0_hw_sm_get_queries(screen); - id = nvc0_hw_sm_get_next_query_id(queries, id); - info->name = nvc0_hw_sm_query_names[id]; - info->query_type = NVC0_HW_SM_QUERY(id); + info->name = nvc0_hw_sm_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_SM_QUERY(queries[id]->type); info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index 94d55a04ff8..65d6c8b3167 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -17,78 +17,45 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq) /* * Performance counter queries: */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT 
- 1) -enum nve4_hw_sm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_CAS_COUNT, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) #define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) enum nvc0_hw_sm_queries { NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, NVC0_HW_SM_QUERY_ATOM_COUNT, NVC0_HW_SM_QUERY_BRANCH, NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, NVC0_HW_SM_QUERY_GRED_COUNT, NVC0_HW_SM_QUERY_GST_REQUEST, NVC0_HW_SM_QUERY_INST_EXECUTED, NVC0_HW_SM_QUERY_INST_ISSUED, + NVC0_HW_SM_QUERY_INST_ISSUED1, + NVC0_HW_SM_QUERY_INST_ISSUED2, NVC0_HW_SM_QUERY_INST_ISSUED1_0, NVC0_HW_SM_QUERY_INST_ISSUED1_1, NVC0_HW_SM_QUERY_INST_ISSUED2_0, NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_L1_GLD_HIT, + NVC0_HW_SM_QUERY_L1_GLD_MISS, + NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, NVC0_HW_SM_QUERY_PROF_TRIGGER_0, NVC0_HW_SM_QUERY_PROF_TRIGGER_1, NVC0_HW_SM_QUERY_PROF_TRIGGER_2, @@ -98,12 +65,17 @@ enum nvc0_hw_sm_queries NVC0_HW_SM_QUERY_PROF_TRIGGER_6, NVC0_HW_SM_QUERY_PROF_TRIGGER_7, NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + 
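/* the Kepler-only counters (formerly NVE4_*) are being folded into this shared enum; each chipset's query table exposes only the entries it actually implements */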
NVC0_HW_SM_QUERY_TH_INST_EXECUTED, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, NVC0_HW_SM_QUERY_WARPS_LAUNCHED, NVC0_HW_SM_QUERY_COUNT }; @@ -113,4 +85,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *); + #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c index 7fbc6e1fd8e..c034d0fd011 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c @@ -19,7 +19,8 @@ nvc0_resource_create(struct pipe_screen *screen, static struct pipe_resource * nvc0_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (templ->target == PIPE_BUFFER) { return NULL; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 37620ea8ba6..3c5b1da2063 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -225,6 +225,10 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: @@ -324,7 +328,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 1; case PIPE_SHADER_CAP_INTEGERS: @@ -333,8 +337,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: return 1; - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 6b02ed5680a..01fe7ce9bfc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -309,7 +309,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) if (!(nvc0->dirty_3d & NVC0_NEW_3D_TFB_TARGETS)) return; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TFB); for (b = 0; b < nvc0->num_tfbbufs; ++b) { struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 7ccce9ff6bf..090a0395432 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -1184,8 +1184,10 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, } nvc0->num_tfbbufs = num_targets; - if (nvc0->tfbbuf_dirty) + if (nvc0->tfbbuf_dirty) { + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TFB); nvc0->dirty_3d |= NVC0_NEW_3D_TFB_TARGETS; + } } static void @@ 
-1340,7 +1342,7 @@ nvc0_set_global_bindings(struct pipe_context *pipe, nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); - nvc0->dirty_cp = NVC0_NEW_CP_GLOBALS; + nvc0->dirty_cp |= NVC0_NEW_CP_GLOBALS; } void diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index fbf45ceca2d..c0ed5c0043d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -672,10 +672,8 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to) ctx_to->screen->cur_ctx = ctx_to; } -static struct state_validate { - void (*func)(struct nvc0_context *); - uint32_t states; -} validate_list[] = { +static struct nvc0_state_validate +validate_list_3d[] = { { nvc0_validate_fb, NVC0_NEW_3D_FRAMEBUFFER }, { nvc0_validate_blend, NVC0_NEW_3D_BLEND }, { nvc0_validate_zsa, NVC0_NEW_3D_ZSA }, @@ -714,7 +712,9 @@ static struct state_validate { }; bool -nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) +nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, + struct nvc0_state_validate *validate_list, int size, + uint32_t *dirty, struct nouveau_bufctx *bufctx) { uint32_t state_mask; int ret; @@ -723,26 +723,38 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) if (nvc0->screen->cur_ctx != nvc0) nvc0_switch_pipe_context(nvc0); - state_mask = nvc0->dirty_3d & mask; + state_mask = *dirty & mask; if (state_mask) { - for (i = 0; i < ARRAY_SIZE(validate_list); ++i) { - struct state_validate *validate = &validate_list[i]; + for (i = 0; i < size; ++i) { + struct nvc0_state_validate *validate = &validate_list[i]; if (state_mask & validate->states) validate->func(nvc0); } - nvc0->dirty_3d &= ~state_mask; + *dirty &= ~state_mask; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false); + nvc0_bufctx_fence(nvc0, bufctx, false); } - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d); + nouveau_pushbuf_bufctx(nvc0->base.pushbuf, bufctx); ret = nouveau_pushbuf_validate(nvc0->base.pushbuf); + return !ret; +} + +bool +nvc0_state_validate_3d(struct nvc0_context *nvc0, uint32_t mask) +{ + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_3d, + ARRAY_SIZE(validate_list_3d), &nvc0->dirty_3d, + nvc0->bufctx_3d); + if (unlikely(nvc0->state.flushed)) { nvc0->state.flushed = false; nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true); } - return !ret; + return ret; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index 49577969d3d..e8b3a4d549a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -693,7 +693,7 @@ nvc0_clear(struct pipe_context *pipe, unsigned buffers, uint32_t mode = 0; /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */ - if (!nvc0_state_validate(nvc0, NVC0_NEW_3D_FRAMEBUFFER)) + if (!nvc0_state_validate_3d(nvc0, NVC0_NEW_3D_FRAMEBUFFER)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { @@ -1195,7 +1195,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) nvc0_blitctx_prepare_state(blit); - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); x_range = (float)info->src.box.width / (float)info->dst.box.width; y_range = (float)info->src.box.height / (float)info->dst.box.height; @@ -1203,8 +1203,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x; y0 = 
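/* maps the dst rect origin into src space (src = origin + range * dst); the quad drawn below now spans 32768 texels per axis rather than 16384, so larger surfaces stay covered */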
(float)info->src.box.y - y_range * (float)info->dst.box.y; - x1 = x0 + 16384.0f * x_range; - y1 = y0 + 16384.0f * y_range; + x1 = x0 + 32768.0f * x_range; + y1 = y0 + 32768.0f * y_range; x0 *= (float)(1 << nv50_miptree(src)->ms_x); x1 *= (float)(1 << nv50_miptree(src)->ms_x); @@ -1315,14 +1315,14 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) *(vbuf++) = fui(y0); *(vbuf++) = fui(z); - *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_x); + *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_x); *(vbuf++) = fui(0.0f); *(vbuf++) = fui(x1); *(vbuf++) = fui(y0); *(vbuf++) = fui(z); *(vbuf++) = fui(0.0f); - *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_y); + *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_y); *(vbuf++) = fui(x0); *(vbuf++) = fui(y1); *(vbuf++) = fui(z); @@ -1644,6 +1644,7 @@ nvc0_blitter_destroy(struct nvc0_screen *screen) } } + pipe_mutex_destroy(blitter->mutex); FREE(blitter); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 647aa10ec35..e0e0ad2a0f7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -969,7 +969,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); } - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); if (nvc0->vertprog->vp.need_draw_parameters) { PUSH_SPACE(push, 9); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index 4a4e8367d28..b3d841461d6 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -301,34 +301,31 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) nvc0->samplers_dirty[s] = 0; } +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nve4_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nve4_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_set_tex_handles, NVC0_NEW_CP_TEXTURES | + NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, +}; static bool -nve4_compute_state_validate(struct nvc0_context *nvc0) +nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - nvc0_compprog_validate(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nve4_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nve4_compute_validate_samplers(nvc0); - if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS)) - nve4_compute_set_tex_handles(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES) - nve4_compute_validate_surfaces(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) - nvc0_compute_validate_globals(nvc0); - - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + if (unlikely(nvc0->state.flushed)) nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - - return true; + return ret; } - static void nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, const uint *block_layout, @@ -447,7 +444,7 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info 
*info) BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD, desc_bo); - ret = !nve4_compute_state_validate(nvc0); + ret = !nve4_state_validate_cp(nvc0, ~0); if (ret) goto out; diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h index 3fff1122b8f..32018580579 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h @@ -294,6 +294,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH 0x00000003 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC 0x00000004 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE 0x00000005 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK11 0x00000011 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK14 0x00000014 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK1A 0x0000001a #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST 0x0000001b #define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH 0x0000001c @@ -307,6 +310,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK0F 0x0000000f #define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1 0x00000010 #define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM 0x00000011 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK13 0x00000013 #define NVE4_COMPUTE_MP_PM_SRCSEL(i0) (0x0000339c + 0x4*(i0)) #define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE 0x00000004 diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 7fad7ad6a43..1c3bb64f0e4 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -259,6 +259,14 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return r300screen->info.vram_size >> 20; case PIPE_CAP_UMA: return 0; + case PIPE_CAP_PCI_GROUP: + return r300screen->info.pci_domain; + case PIPE_CAP_PCI_BUS: + return r300screen->info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return r300screen->info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return r300screen->info.pci_func; } return 0; } diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index e90e741a353..57456c6d867 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -971,7 +971,8 @@ static void r300_texture_destroy(struct pipe_screen *screen, boolean r300_resource_get_handle(struct pipe_screen* screen, struct pipe_resource *texture, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct radeon_winsys *rws = r300_screen(screen)->rws; struct r300_resource* tex = (struct r300_resource*)texture; @@ -1005,6 +1006,7 @@ r300_texture_create_object(struct r300_screen *rscreen, { struct radeon_winsys *rws = rscreen->rws; struct r300_resource *tex = NULL; + struct radeon_bo_metadata tiling = {}; tex = CALLOC_STRUCT(r300_resource); if (!tex) { @@ -1059,10 +1061,10 @@ r300_texture_create_object(struct r300_screen *rscreen, util_format_is_depth_or_stencil(base->format) ? 
"depth" : "color"); } - rws->buffer_set_tiling(tex->buf, NULL, - tex->tex.microtile, tex->tex.macrotile[0], - 0, 0, 0, 0, 0, 0, 0, - tex->tex.stride_in_bytes[0], false); + tiling.microtile = tex->tex.microtile; + tiling.macrotile = tex->tex.macrotile[0]; + tiling.stride = tex->tex.stride_in_bytes[0]; + rws->buffer_set_metadata(tex->buf, &tiling); return tex; @@ -1097,13 +1099,14 @@ struct pipe_resource *r300_texture_create(struct pipe_screen *screen, struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *base, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r300_screen *rscreen = r300_screen(screen); struct radeon_winsys *rws = rscreen->rws; struct pb_buffer *buffer; - enum radeon_bo_layout microtile, macrotile; unsigned stride; + struct radeon_bo_metadata tiling = {}; /* Support only 2D textures without mipmaps */ if ((base->target != PIPE_TEXTURE_2D && @@ -1117,25 +1120,24 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, if (!buffer) return NULL; - rws->buffer_get_tiling(buffer, &microtile, &macrotile, NULL, NULL, NULL, - NULL, NULL, NULL); + rws->buffer_get_metadata(buffer, &tiling); /* Enforce a microtiled zbuffer. */ if (util_format_is_depth_or_stencil(base->format) && - microtile == RADEON_LAYOUT_LINEAR) { + tiling.microtile == RADEON_LAYOUT_LINEAR) { switch (util_format_get_blocksize(base->format)) { case 4: - microtile = RADEON_LAYOUT_TILED; + tiling.microtile = RADEON_LAYOUT_TILED; break; case 2: - microtile = RADEON_LAYOUT_SQUARETILED; + tiling.microtile = RADEON_LAYOUT_SQUARETILED; break; } } return (struct pipe_resource*) - r300_texture_create_object(rscreen, base, microtile, macrotile, + r300_texture_create_object(rscreen, base, tiling.microtile, tiling.macrotile, stride, buffer); } diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h index 213bdffc2ed..4c339429eca 100644 --- a/src/gallium/drivers/r300/r300_texture.h +++ b/src/gallium/drivers/r300/r300_texture.h @@ -25,6 +25,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" +#include "pipe/p_screen.h" struct pipe_screen; struct pipe_context; @@ -62,12 +63,14 @@ void r300_texture_setup_format_state(struct r300_screen *screen, boolean r300_resource_get_handle(struct pipe_screen* screen, struct pipe_resource *texture, - struct winsys_handle *whandle); + struct winsys_handle *whandle, + unsigned usage); struct pipe_resource* r300_texture_from_handle(struct pipe_screen* screen, const struct pipe_resource* base, - struct winsys_handle *whandle); + struct winsys_handle *whandle, + unsigned usage); struct pipe_resource* r300_texture_create(struct pipe_screen* screen, diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index a12638a5bdb..83313cb28cf 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -989,13 +989,6 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx, MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size); unsigned pitch = align(pipe_buffer->width0, pitch_alignment); - /* XXX: This is copied from evergreen_init_color_surface(). I don't - * know why this is necessary. 
- */ - if (pipe_buffer->usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } - surf->cb_color_base = r600_resource(pipe_buffer)->gpu_address >> 8; surf->cb_color_pitch = (pitch / 8) - 1; @@ -1146,11 +1139,7 @@ void evergreen_init_color_surface(struct r600_context *rctx, swap = r600_translate_colorswap(surf->base.format); assert(swap != ~0); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } else { - endian = r600_colorformat_endian_swap(format); - } + endian = r600_colorformat_endian_swap(format); /* blend clamp should be set for all NORM/SRGB types */ if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM || diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 5a6ce71414c..7018088d204 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -439,7 +439,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_VENDOR_ID: - return 0x1002; + return ATI_VENDOR_ID; case PIPE_CAP_DEVICE_ID: return rscreen->b.info.pci_id; case PIPE_CAP_ACCELERATED: @@ -450,6 +450,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 0; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: return rscreen->b.chip_class >= R700; + case PIPE_CAP_PCI_GROUP: + return rscreen->b.info.pci_domain; + case PIPE_CAP_PCI_BUS: + return rscreen->b.info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return rscreen->b.info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return rscreen->b.info.pci_func; } return 0; } diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index f60e30486a2..f9026197b26 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -930,11 +930,7 @@ static void r600_init_color_surface(struct r600_context *rctx, swap = r600_translate_colorswap(surf->base.format); assert(swap != ~0); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } else { - endian = r600_colorformat_endian_swap(format); - } + endian = r600_colorformat_endian_swap(format); /* set blend bypass according to docs if SINT/UINT or 8/24 COLOR variants */ diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index aa3a085c6d2..2211e07ceba 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -645,21 +645,21 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, if (rviews[i]) { struct r600_texture *rtex = (struct r600_texture*)rviews[i]->base.texture; + bool is_buffer = rviews[i]->base.texture->target == PIPE_BUFFER; - if (rviews[i]->base.texture->target != PIPE_BUFFER) { - if (rtex->is_depth && !rtex->is_flushing_texture) { - dst->views.compressed_depthtex_mask |= 1 << i; - } else { - dst->views.compressed_depthtex_mask &= ~(1 << i); - } + if (!is_buffer && rtex->is_depth && !rtex->is_flushing_texture) { + dst->views.compressed_depthtex_mask |= 1 << i; + } else { + dst->views.compressed_depthtex_mask &= ~(1 << i); + } - /* Track compressed colorbuffers. */ - if (rtex->cmask.size) { - dst->views.compressed_colortex_mask |= 1 << i; - } else { - dst->views.compressed_colortex_mask &= ~(1 << i); - } + /* Track compressed colorbuffers. 
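A view counts as compressed while its texture has CMASK storage; with the is_buffer check below, buffer targets now clear the bit too instead of leaving it stale.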
*/ + if (!is_buffer && rtex->cmask.size) { + dst->views.compressed_colortex_mask |= 1 << i; + } else { + dst->views.compressed_colortex_mask &= ~(1 << i); } + /* Changing from array to non-arrays textures and vice versa requires * updating TEX_ARRAY_OVERRIDE in sampler states on R6xx-R7xx. */ if (rctx->b.chip_class <= R700 && @@ -693,6 +693,26 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, } } +static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *views) +{ + uint32_t mask = views->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *res = views->views[i]->base.texture; + + if (res && res->target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + + if (rtex->cmask.size) { + views->compressed_colortex_mask |= 1 << i; + } else { + views->compressed_colortex_mask &= ~(1 << i); + } + } + } +} + static void r600_set_viewport_states(struct pipe_context *ctx, unsigned start_slot, unsigned num_viewports, @@ -1457,6 +1477,16 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (!rctx->blitter->running) { unsigned i; + unsigned counter; + + counter = p_atomic_read(&rctx->screen->b.compressed_colortex_counter); + if (counter != rctx->b.last_compressed_colortex_counter) { + rctx->b.last_compressed_colortex_counter = counter; + + for (i = 0; i < PIPE_SHADER_TYPES; ++i) { + r600_update_compressed_colortex_mask(&rctx->samplers[i].views); + } + } /* Decompress textures if needed. */ for (i = 0; i < PIPE_SHADER_TYPES; i++) { @@ -1672,7 +1702,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info struct radeon_winsys_cs *cs = rctx->b.gfx.cs; bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; uint64_t mask; - unsigned num_patches; + unsigned num_patches, dirty_fb_counter; if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) { return; @@ -1688,6 +1718,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); } + /* Re-emit the framebuffer state if needed. */ + dirty_fb_counter = p_atomic_read(&rctx->b.screen->dirty_fb_counter); + if (dirty_fb_counter != rctx->b.last_dirty_fb_counter) { + rctx->b.last_dirty_fb_counter = dirty_fb_counter; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); + } + if (!r600_update_derived_state(rctx)) { /* useless to render because current rendering command * can't be achieved diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index b384baa9237..33ba0fbca9b 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -213,6 +213,10 @@ static bool r600_invalidate_buffer(struct r600_common_context *rctx, struct r600_resource *rbuffer) { + /* Shared buffers can't be reallocated. */ + if (rbuffer->is_shared) + return false; + /* In AMD_pinned_memory, the user pointer association only gets * broken when the buffer is explicitly re-allocated. */ @@ -294,6 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, * in which case it can be mapped unsynchronized. 
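* Shared (exported) buffers are excluded below, since another process may have written the range even though this context did not.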
*/ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && usage & PIPE_TRANSFER_WRITE && + !rbuffer->is_shared && !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) { usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } @@ -311,12 +316,17 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, if (r600_invalidate_buffer(rctx, rbuffer)) { /* At this point, the buffer is always idle. */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. */ + usage |= PIPE_TRANSFER_DISCARD_RANGE; } } - else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && - !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && - r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { + + if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT)) && + !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && + r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. */ @@ -341,7 +351,8 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } /* Using a staging buffer in GTT for larger reads is much faster. */ else if ((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_WRITE) && + !(usage & (PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_PERSISTENT)) && rbuffer->domains == RADEON_DOMAIN_VRAM && r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) { struct r600_resource *staging; @@ -453,6 +464,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen, rbuffer->b.vtbl = &r600_buffer_vtbl; rbuffer->buf = NULL; rbuffer->TC_L2_dirty = false; + rbuffer->is_shared = false; util_range_init(&rbuffer->valid_buffer_range); return rbuffer; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index b77b1321d73..cf8dcf7ea88 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -43,6 +43,8 @@ #include "util/u_suballoc.h" #include "util/u_transfer.h" +#define ATI_VENDOR_ID 0x1002 + #define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) @@ -166,6 +168,10 @@ struct r600_resource { * use TC L2. */ bool TC_L2_dirty; + + /* Whether the resource has been exported via resource_get_handle. */ + bool is_shared; + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct r600_transfer { @@ -218,7 +224,7 @@ struct r600_texture { struct r600_fmask_info fmask; struct r600_cmask_info cmask; struct r600_resource *cmask_buffer; - struct r600_resource *dcc_buffer; + unsigned dcc_offset; /* 0 = disabled */ unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; @@ -321,6 +327,23 @@ struct r600_common_screen { /* Performance counters. */ struct r600_perfcounters *perfcounters; + + /* If pipe_screen wants to re-emit the framebuffer state of all + * contexts, it should atomically increment this. Each context will + * compare this with its own last known value of the counter before + * drawing and re-emit the framebuffer state accordingly. + */ + unsigned dirty_fb_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. 
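+ * Contexts snapshot it as last_compressed_colortex_counter and rebuild their binding masks before the next draw when it has changed.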
+ */ + unsigned compressed_colortex_counter; + + void (*query_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -388,6 +411,8 @@ struct r600_common_context { struct pipe_fence_handle *last_sdma_fence; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; + unsigned last_dirty_fb_counter; + unsigned last_compressed_colortex_counter; struct u_upload_mgr *uploader; struct u_suballocator *allocator_so_filled_size; @@ -464,6 +489,9 @@ struct r600_common_context { unsigned first_layer, unsigned last_layer, unsigned first_sample, unsigned last_sample); + void (*decompress_dcc)(struct pipe_context *ctx, + struct r600_texture *rtex); + /* Reallocate the buffer and update all resource bindings where * the buffer is bound, including all resource descriptors. */ void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf); diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index 0b31d0a1f01..115c7289c4c 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -228,31 +228,145 @@ static int r600_setup_surface(struct pipe_screen *screen, return 0; } -static boolean r600_texture_get_handle(struct pipe_screen* screen, - struct pipe_resource *ptex, - struct winsys_handle *whandle) +static void r600_texture_init_metadata(struct r600_texture *rtex, + struct radeon_bo_metadata *metadata) { - struct r600_texture *rtex = (struct r600_texture*)ptex; - struct r600_resource *resource = &rtex->resource; struct radeon_surf *surface = &rtex->surface; + + memset(metadata, 0, sizeof(*metadata)); + metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->pipe_config = surface->pipe_config; + metadata->bankw = surface->bankw; + metadata->bankh = surface->bankh; + metadata->tile_split = surface->tile_split; + metadata->stencil_tile_split = surface->stencil_tile_split; + metadata->mtilea = surface->mtilea; + metadata->num_banks = surface->num_banks; + metadata->stride = surface->level[0].pitch_bytes; + metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; +} + +static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen) +{ + p_atomic_inc(&rscreen->dirty_fb_counter); +} + +static void r600_eliminate_fast_color_clear(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + struct pipe_context *ctx = rscreen->aux_context; + + pipe_mutex_lock(rscreen->aux_context_lock); + ctx->flush_resource(ctx, &rtex->resource.b.b); + ctx->flush(ctx, NULL, 0); + pipe_mutex_unlock(rscreen->aux_context_lock); +} + +static void r600_texture_disable_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!rtex->cmask.size) + return; + + assert(rtex->resource.b.b.nr_samples <= 1); + + /* Disable CMASK. */ + memset(&rtex->cmask, 0, sizeof(rtex->cmask)); + rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8; + + if (rscreen->chip_class >= SI) + rtex->cb_color_info &= ~SI_S_028C70_FAST_CLEAR(1); + else + rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1); + + if (rtex->cmask_buffer != &rtex->resource) + pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); + + /* Notify all contexts about the change. 
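+ * Both counters get bumped here: the framebuffer one so CB state is re-emitted, and the colortex one so sampler views drop the now-stale compressed bits.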
*/ + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static void r600_texture_disable_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + struct r600_common_context *rctx = + (struct r600_common_context *)rscreen->aux_context; + + if (!rtex->dcc_offset) + return; + + /* Decompress DCC. */ + pipe_mutex_lock(rscreen->aux_context_lock); + rctx->decompress_dcc(&rctx->b, rtex); + rctx->b.flush(&rctx->b, NULL, 0); + pipe_mutex_unlock(rscreen->aux_context_lock); + + /* Disable DCC. */ + rtex->dcc_offset = 0; + rtex->cb_color_info &= ~VI_S_028C70_DCC_ENABLE(1); + + /* Notify all contexts about the change. */ + r600_dirty_all_framebuffer_states(rscreen); +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_resource *res = (struct r600_resource*)resource; + struct r600_texture *rtex = (struct r600_texture*)resource; + struct radeon_bo_metadata metadata; - rscreen->ws->buffer_set_tiling(resource->buf, - NULL, - surface->level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->pipe_config, - surface->bankw, surface->bankh, - surface->tile_split, - surface->stencil_tile_split, - surface->mtilea, surface->num_banks, - surface->level[0].pitch_bytes, - (surface->flags & RADEON_SURF_SCANOUT) != 0); - - return rscreen->ws->buffer_get_handle(resource->buf, - surface->level[0].pitch_bytes, whandle); + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->target != PIPE_BUFFER && + (resource->nr_samples > 1 || rtex->is_depth)) + return NULL; + + if (!res->is_shared) { + res->is_shared = true; + res->external_usage = usage; + + if (resource->target != PIPE_BUFFER) { + /* Since shader image stores don't support DCC on VI, + * disable it for external clients that want write + * access. + */ + if (usage & PIPE_HANDLE_USAGE_WRITE) + r600_texture_disable_dcc(rscreen, rtex); + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) { + /* Eliminate fast clear (both CMASK and DCC) */ + r600_eliminate_fast_color_clear(rscreen, rtex); + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + r600_texture_disable_cmask(rscreen, rtex); + } + + /* Set metadata. 
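+ * The tiling layout (plus any driver-opaque blob from query_opaque_metadata) travels with the BO, so the importing process can reconstruct the surface without side-channel information.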
*/ + r600_texture_init_metadata(rtex, &metadata); + if (rscreen->query_opaque_metadata) + rscreen->query_opaque_metadata(rscreen, rtex, + &metadata); + + rscreen->ws->buffer_set_metadata(res->buf, &metadata); + } + } else { + assert(res->external_usage == usage); + } + + return rscreen->ws->buffer_get_handle(res->buf, + rtex->surface.level[0].pitch_bytes, + whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -268,7 +382,6 @@ static void r600_texture_destroy(struct pipe_screen *screen, if (rtex->cmask_buffer != &rtex->resource) { pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); } - pipe_resource_reference((struct pipe_resource**)&rtex->dcc_buffer, NULL); pb_reference(&resource->buf, NULL); FREE(rtex); } @@ -489,25 +602,8 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen rtex->cb_color_info |= SI_S_028C70_FAST_CLEAR(1); else rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); -} -static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen, - struct r600_texture *rtex) -{ - if (rscreen->debug_flags & DBG_NO_DCC) - return; - - rtex->dcc_buffer = (struct r600_resource *) - r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment); - if (rtex->dcc_buffer == NULL) { - return; - } - - r600_screen_clear_buffer(rscreen, &rtex->dcc_buffer->b.b, 0, rtex->surface.dcc_size, - 0xFFFFFFFF, true); - - rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1); + p_atomic_inc(&rscreen->compressed_colortex_counter); } static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, @@ -644,10 +740,10 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) rtex->htile_buffer->buf->alignment, rtex->htile.pitch, rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign); - if (rtex->dcc_buffer) { - fprintf(f, " DCC: size=%u, alignment=%u\n", - rtex->dcc_buffer->b.b.width0, - rtex->dcc_buffer->buf->alignment); + if (rtex->dcc_offset) { + fprintf(f, " DCC: offset=%u, size=%"PRIu64", alignment=%"PRIu64"\n", + rtex->dcc_offset, rtex->surface.dcc_size, + rtex->surface.dcc_alignment); for (i = 0; i <= rtex->surface.last_level; i++) fprintf(f, " DCCLevel[%i]: offset=%"PRIu64"\n", i, rtex->surface.level[i].dcc_offset); @@ -745,8 +841,14 @@ r600_texture_create_object(struct pipe_screen *screen, return NULL; } } - if (rtex->surface.dcc_size) - vi_texture_alloc_dcc_separate(rscreen, rtex); + + if (!buf && rtex->surface.dcc_size && + !(rscreen->debug_flags & DBG_NO_DCC)) { + /* Reserve space for the DCC buffer. */ + rtex->dcc_offset = align(rtex->size, rtex->surface.dcc_alignment); + rtex->size = rtex->dcc_offset + rtex->surface.dcc_size; + rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1); + } } /* Now create the backing buffer. */ @@ -768,6 +870,12 @@ r600_texture_create_object(struct pipe_screen *screen, rtex->cmask.offset, rtex->cmask.size, 0xCCCCCCCC, true); } + if (rtex->dcc_offset) { + r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, + rtex->dcc_offset, + rtex->surface.dcc_size, + 0xFFFFFFFF, true); + } /* Initialize the CMASK base register value. 
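The register holds the address in 256-byte units, hence the gpu_address >> 8 seen in r600_texture_disable_cmask above.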
*/ rtex->cmask.base_address_reg = @@ -877,16 +985,17 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; unsigned stride = 0; unsigned array_mode; - enum radeon_bo_layout micro, macro; struct radeon_surf surface; - bool scanout; int r; + struct radeon_bo_metadata metadata = {}; + struct r600_texture *rtex; /* Support only 2D textures without mipmaps */ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || @@ -897,15 +1006,17 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen if (!buf) return NULL; - rscreen->ws->buffer_get_tiling(buf, &micro, &macro, - &surface.bankw, &surface.bankh, - &surface.tile_split, - &surface.stencil_tile_split, - &surface.mtilea, &scanout); + rscreen->ws->buffer_get_metadata(buf, &metadata); - if (macro == RADEON_LAYOUT_TILED) + surface.bankw = metadata.bankw; + surface.bankh = metadata.bankh; + surface.tile_split = metadata.tile_split; + surface.stencil_tile_split = metadata.stencil_tile_split; + surface.mtilea = metadata.mtilea; + + if (metadata.macrotile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_2D; - else if (micro == RADEON_LAYOUT_TILED) + else if (metadata.microtile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_1D; else array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; @@ -915,11 +1026,17 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen return NULL; } - if (scanout) + if (metadata.scanout) surface.flags |= RADEON_SURF_SCANOUT; - return (struct pipe_resource *)r600_texture_create_object(screen, templ, - stride, buf, &surface); + rtex = r600_texture_create_object(screen, templ, + stride, buf, &surface); + if (!rtex) + return NULL; + + rtex->resource.is_shared = true; + rtex->resource.external_usage = usage; + return &rtex->resource.b.b; } bool r600_init_flushed_depth_texture(struct pipe_context *ctx, @@ -1450,6 +1567,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } + /* shared textures can't use fast clear without an explicit flush, * because there is no way to communicate the clear color among * all clients */ + if (tex->resource.is_shared && + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && rctx->chip_class >= CIK && @@ -1458,7 +1583,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } - if (tex->dcc_buffer) { + if (tex->dcc_offset) { uint32_t reset_value; bool clear_words_needed; @@ -1467,8 +1592,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, vi_get_fast_clear_parameters(fb->cbufs[i]->format, color, &reset_value, &clear_words_needed); - rctx->clear_buffer(&rctx->b, &tex->dcc_buffer->b.b, - 0, tex->surface.dcc_size, reset_value, true); + rctx->clear_buffer(&rctx->b, &tex->resource.b.b, + tex->dcc_offset, tex->surface.dcc_size, + reset_value, true); if (clear_words_needed) tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 367aabc7a18..b8efc58eaab 100644 --- 
a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -207,7 +207,7 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) } } -static unsigned calc_ctx_size(struct ruvd_decoder *dec) +static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec) { unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); @@ -224,6 +224,39 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec) return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; } +static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) +{ + unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; + unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; + unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3)); + log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + + width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + + num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); + context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); + max_mb_address = (unsigned) ceil(height * 8 / 2048.0); + + cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; + db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); + + return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; +} + /* calculate size of reference picture buffer */ static unsigned calc_dpb_size(struct ruvd_decoder *dec) { @@ -305,7 +338,10 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); - dpb_size = align((width * height * 3) / 2, 256) * max_references; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + dpb_size = align((width * height * 9) / 4, 256) * max_references; + else + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -596,6 +632,15 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video result.direct_reflist[i][j] = pic->RefPicList[i][j]; } + if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) && + (target->buffer_format == PIPE_FORMAT_NV12)) { + result.p010_mode = 0; + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } + /* TODO result.highestTid; result.isNonRef; @@ -971,6 +1016,17 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, case PIPE_VIDEO_FORMAT_HEVC: dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + if (dec->ctx.res == NULL) { + unsigned ctx_size; + if (dec->base.profile == 
PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); + else + ctx_size = calc_ctx_size_h265_main(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated context buffer.\n"); + } + rvid_clear_buffer(decoder->context, &dec->ctx); + } break; case PIPE_VIDEO_FORMAT_VC1: @@ -1123,15 +1179,6 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { - unsigned ctx_size = calc_ctx_size(dec); - if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { - RVID_ERR("Can't allocated context buffer.\n"); - goto error; - } - rvid_clear_buffer(context, &dec->ctx); - } - map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index 41603b32403..087d9422c04 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -404,7 +404,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (rscreen->info.drm_major == 3) enc->use_vm = true; - if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) + if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) || + rscreen->info.drm_major == 3) enc->use_vui = true; if (rscreen->info.family >= CHIP_TONGA && rscreen->info.family != CHIP_STONEY) diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index ec29d8cb754..24b0eed51d2 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -237,6 +237,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_SUPPORTED: switch (codec) { case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; case PIPE_VIDEO_FORMAT_MPEG4: case PIPE_VIDEO_FORMAT_MPEG4_AVC: if (rscreen->family < CHIP_PALM) @@ -247,8 +248,11 @@ int rvid_get_video_param(struct pipe_screen *screen, return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ - return rscreen->family >= CHIP_CARRIZO && - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + if (rscreen->family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (rscreen->family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } @@ -257,7 +261,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; + return (rscreen->family < CHIP_TONGA) ? 
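/* pre-Tonga UVD keeps its 1152-line cap; Tonga and newer are raised from 2304 to 4096 to allow 4K decode */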
1152 : 4096; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -296,6 +300,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: return 41; case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: return 186; default: return 0; diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 1160d235062..b8a065957a7 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -231,6 +231,12 @@ struct radeon_winsys_cs { }; struct radeon_info { + /* PCI info: domain:bus:dev:func */ + uint32_t pci_domain; + uint32_t pci_bus; + uint32_t pci_dev; + uint32_t pci_func; + /* Device info. */ uint32_t pci_id; enum radeon_family family; @@ -276,6 +282,31 @@ struct radeon_info { uint32_t cik_macrotile_mode_array[16]; }; +/* Tiling info for display code, DRI sharing, and other data. */ +struct radeon_bo_metadata { + /* Tiling flags describing the texture layout for display code + * and DRI sharing. + */ + enum radeon_bo_layout microtile; + enum radeon_bo_layout macrotile; + unsigned pipe_config; + unsigned bankw; + unsigned bankh; + unsigned tile_split; + unsigned stencil_tile_split; + unsigned mtilea; + unsigned num_banks; + unsigned stride; + bool scanout; + + /* Additional metadata associated with the buffer, in bytes. + * The maximum size is 64 * 4. This is opaque for the winsys & kernel. + * Supported by amdgpu only. + */ + uint32_t size_metadata; + uint32_t metadata[64]; +}; + enum radeon_feature_id { RADEON_FID_R300_HYPERZ_ACCESS, /* ZMask + HiZ */ RADEON_FID_R300_CMASK_ACCESS, @@ -454,45 +485,24 @@ struct radeon_winsys { enum radeon_bo_usage usage); /** - * Return tiling flags describing a memory layout of a buffer object. + * Return buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to get the flags from. - * \param macrotile A pointer to the return value of the microtile flag. - * \param microtile A pointer to the return value of the macrotile flag. - * - * \note microtile and macrotile are not bitmasks! + * \param md Metadata */ - void (*buffer_get_tiling)(struct pb_buffer *buf, - enum radeon_bo_layout *microtile, - enum radeon_bo_layout *macrotile, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout); + void (*buffer_get_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** - * Set tiling flags describing a memory layout of a buffer object. + * Set buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to set the flags for. - * \param cs A command stream to flush if the buffer is referenced by it. - * \param macrotile A macrotile flag. - * \param microtile A microtile flag. - * \param stride A stride of the buffer in bytes, for texturing. - * - * \note microtile and macrotile are not bitmasks! 
- */ - void (*buffer_set_tiling)(struct pb_buffer *buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtile, - enum radeon_bo_layout macrotile, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - unsigned stride, - bool scanout); + * \param md Metadata + */ + void (*buffer_set_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** * Get a winsys buffer from a winsys handle. The internal structure diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 76913914b38..6eb62dcc890 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -243,7 +243,7 @@ void cik_sdma_copy(struct pipe_context *ctx, if (src->format != dst->format || rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 || (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) || - rdst->dcc_buffer || rsrc->dcc_buffer) { + rdst->dcc_offset || rsrc->dcc_offset) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 115877060ba..f9a6de48f6b 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -241,8 +241,9 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, si_mark_atom_dirty(sctx, &sctx->db_render_state); } -void si_flush_depth_textures(struct si_context *sctx, - struct si_textures_info *textures) +static void +si_flush_depth_textures(struct si_context *sctx, + struct si_textures_info *textures) { unsigned i; unsigned mask = textures->depth_texture_mask; @@ -271,18 +272,29 @@ void si_flush_depth_textures(struct si_context *sctx, static void si_blit_decompress_color(struct pipe_context *ctx, struct r600_texture *rtex, unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) + unsigned first_layer, unsigned last_layer, + bool need_dcc_decompress) { struct si_context *sctx = (struct si_context *)ctx; unsigned layer, level, checked_last_layer, max_layer; - if (!rtex->dirty_level_mask) + if (!rtex->dirty_level_mask && !need_dcc_decompress) return; for (level = first_level; level <= last_level; level++) { - if (!(rtex->dirty_level_mask & (1 << level))) + void* custom_blend; + + if (!(rtex->dirty_level_mask & (1 << level)) && !need_dcc_decompress) continue; + if (rtex->dcc_offset && need_dcc_decompress) { + custom_blend = sctx->custom_blend_dcc_decompress; + } else if (rtex->fmask.size) { + custom_blend = sctx->custom_blend_decompress; + } else { + custom_blend = sctx->custom_blend_fastclear; + } + /* The smaller the mipmap level, the less layers there are * as far as 3D textures are concerned. */ max_layer = util_max_layer(&rtex->resource.b.b, level); @@ -298,9 +310,7 @@ static void si_blit_decompress_color(struct pipe_context *ctx, cbsurf = ctx->create_surface(ctx, &rtex->resource.b.b, &surf_tmpl); si_blitter_begin(ctx, SI_DECOMPRESS); - util_blitter_custom_color(sctx->blitter, cbsurf, - rtex->fmask.size ? 
sctx->custom_blend_decompress : - sctx->custom_blend_fastclear); + util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); si_blitter_end(ctx); pipe_surface_reference(&cbsurf, NULL); @@ -314,8 +324,9 @@ static void si_blit_decompress_color(struct pipe_context *ctx, } } -void si_decompress_color_textures(struct si_context *sctx, - struct si_textures_info *textures) +static void +si_decompress_color_textures(struct si_context *sctx, + struct si_textures_info *textures) { unsigned i; unsigned mask = textures->compressed_colortex_mask; @@ -330,11 +341,37 @@ void si_decompress_color_textures(struct si_context *sctx, assert(view); tex = (struct r600_texture *)view->texture; - assert(tex->cmask.size || tex->fmask.size || tex->dcc_buffer); + assert(tex->cmask.size || tex->fmask.size || tex->dcc_offset); si_blit_decompress_color(&sctx->b.b, tex, view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level)); + 0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level), + false); + } +} + +void si_decompress_textures(struct si_context *sctx) +{ + unsigned compressed_colortex_counter; + + if (sctx->blitter->running) + return; + + /* Update the compressed_colortex_mask if necessary. */ + compressed_colortex_counter = p_atomic_read(&sctx->screen->b.compressed_colortex_counter); + if (compressed_colortex_counter != sctx->b.last_compressed_colortex_counter) { + sctx->b.last_compressed_colortex_counter = compressed_colortex_counter; + si_update_compressed_colortex_masks(sctx); + } + + /* Flush depth textures which need to be flushed. */ + for (int i = 0; i < SI_NUM_SHADERS; i++) { + if (sctx->samplers[i].depth_texture_mask) { + si_flush_depth_textures(sctx, &sctx->samplers[i]); + } + if (sctx->samplers[i].compressed_colortex_mask) { + si_decompress_color_textures(sctx, &sctx->samplers[i]); + } } } @@ -483,9 +520,9 @@ static void si_decompress_subresource(struct pipe_context *ctx, si_blit_decompress_depth_in_place(sctx, rtex, true, level, level, first_layer, last_layer); - } else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_buffer) { + } else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_offset) { si_blit_decompress_color(ctx, rtex, level, level, - first_layer, last_layer); + first_layer, last_layer, false); } } @@ -712,7 +749,7 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx, dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D && !(dst->surface.flags & RADEON_SURF_SCANOUT) && (!dst->cmask.size || !dst->dirty_level_mask) && /* dst cannot be fast-cleared */ - !dst->dcc_buffer) { + !dst->dcc_offset) { si_blitter_begin(ctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 
0 : SI_DISABLE_RENDER_COND)); util_blitter_custom_resolve_color(sctx->blitter, @@ -761,12 +798,23 @@ static void si_flush_resource(struct pipe_context *ctx, assert(res->target != PIPE_BUFFER); - if (!rtex->is_depth && rtex->cmask.size) { + if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) { si_blit_decompress_color(ctx, rtex, 0, res->last_level, - 0, util_max_layer(res, 0)); + 0, util_max_layer(res, 0), false); } } +static void si_decompress_dcc(struct pipe_context *ctx, + struct r600_texture *rtex) +{ + if (!rtex->dcc_offset) + return; + + si_blit_decompress_color(ctx, rtex, 0, rtex->resource.b.b.last_level, + 0, util_max_layer(&rtex->resource.b.b, 0), + true); +} + static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned offset, unsigned size, @@ -836,4 +884,5 @@ void si_init_blit_functions(struct si_context *sctx) sctx->b.b.blit = si_blit; sctx->b.b.flush_resource = si_flush_resource; sctx->b.blit_decompress_depth = si_blit_decompress_depth; + sctx->b.decompress_dcc = si_decompress_dcc; } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 345f2bbc381..d12b3e6b28a 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -150,20 +150,17 @@ static void si_release_sampler_views(struct si_sampler_views *views) si_release_descriptors(&views->desc); } -static void si_sampler_view_add_buffers(struct si_context *sctx, - struct si_sampler_view *rview) +static void si_sampler_view_add_buffer(struct si_context *sctx, + struct pipe_resource *resource) { - if (rview->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->resource, RADEON_USAGE_READ, - r600_get_sampler_view_priority(rview->resource)); - } + struct r600_resource *rres = (struct r600_resource*)resource; - if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->dcc_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DCC); - } + if (!resource) + return; + + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, + RADEON_USAGE_READ, + r600_get_sampler_view_priority(rres)); } static void si_sampler_views_begin_new_cs(struct si_context *sctx, @@ -174,10 +171,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, /* Add buffers to the CS. 
*/ while (mask) { int i = u_bit_scan64(&mask); - struct si_sampler_view *rview = - (struct si_sampler_view*)views->views[i]; - si_sampler_view_add_buffers(sctx, rview); + si_sampler_view_add_buffer(sctx, views->views[i]->texture); } if (!views->desc.buffer) @@ -190,15 +185,20 @@ static void si_set_sampler_view(struct si_context *sctx, struct si_sampler_views *views, unsigned slot, struct pipe_sampler_view *view) { - if (views->views[slot] == view) + struct si_sampler_view *rview = (struct si_sampler_view*)view; + + if (view && view->texture && view->texture->target != PIPE_BUFFER && + G_008F28_COMPRESSION_EN(rview->state[6]) && + ((struct r600_texture*)view->texture)->dcc_offset == 0) { + rview->state[6] &= C_008F28_COMPRESSION_EN & + C_008F28_ALPHA_IS_ON_MSB; + } else if (views->views[slot] == view) return; if (view) { - struct si_sampler_view *rview = - (struct si_sampler_view*)view; - struct r600_texture *rtex = (struct r600_texture*)view->texture; + struct r600_texture *rtex = (struct r600_texture *)view->texture; - si_sampler_view_add_buffers(sctx, rview); + si_sampler_view_add_buffer(sctx, view->texture); pipe_sampler_view_reference(&views->views[slot], view); memcpy(views->desc.list + slot * 16, rview->state, 8*4); @@ -229,6 +229,12 @@ static void si_set_sampler_view(struct si_context *sctx, views->desc.list_dirty = true; } +static bool is_compressed_colortex(struct r600_texture *rtex) +{ + return rtex->cmask.size || rtex->fmask.size || + (rtex->dcc_offset && rtex->dirty_level_mask); +} + static void si_set_sampler_views(struct pipe_context *ctx, unsigned shader, unsigned start, unsigned count, @@ -262,8 +268,7 @@ static void si_set_sampler_views(struct pipe_context *ctx, } else { samplers->depth_texture_mask &= ~(1 << slot); } - if (rtex->cmask.size || rtex->fmask.size || - (rtex->dcc_buffer && rtex->dirty_level_mask)) { + if (is_compressed_colortex(rtex)) { samplers->compressed_colortex_mask |= 1 << slot; } else { samplers->compressed_colortex_mask &= ~(1 << slot); @@ -275,6 +280,27 @@ static void si_set_sampler_views(struct pipe_context *ctx, } } +static void +si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers) +{ + uint64_t mask = samplers->views.desc.enabled_mask; + + while (mask) { + int i = u_bit_scan64(&mask); + struct pipe_resource *res = samplers->views.views[i]->texture; + + if (res && res->target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + + if (is_compressed_colortex(rtex)) { + samplers->compressed_colortex_mask |= 1 << i; + } else { + samplers->compressed_colortex_mask &= ~(1 << i); + } + } + } +} + /* SAMPLER STATES */ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader, @@ -303,6 +329,7 @@ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader, */ if (samplers->views.views[i] && samplers->views.views[i]->texture && + samplers->views.views[i]->texture->target != PIPE_BUFFER && ((struct r600_texture*)samplers->views.views[i]->texture)->fmask.size) continue; @@ -767,6 +794,19 @@ static void si_desc_reset_buffer_offset(struct pipe_context *ctx, S_008F04_BASE_ADDRESS_HI(va >> 32); } +/* TEXTURE METADATA ENABLE/DISABLE */ + +/* CMASK can be enabled (for fast clear) and disabled (for texture export) + * while the texture is bound, possibly by a different context. In that case, + * call this function to update compressed_colortex_masks. 
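
The comment above describes a cross-context hazard: another context sharing the screen can fast-clear or decompress a texture while this one has it bound. si_decompress_textures (earlier in this patch) closes the race by polling a screen-wide atomic counter once per draw and rebuilding the masks only when the counter has moved. The same invalidation idiom in isolation (the counter and rebuild names here are illustrative):

   /* Screen-level generation counter, bumped whenever CMASK/DCC state
    * changes behind a context's back (compare compressed_colortex_counter
    * in the hunks above).
    */
   unsigned gen = p_atomic_read(&screen->state_generation);

   if (gen != ctx->last_seen_generation) {
      ctx->last_seen_generation = gen;
      rebuild_cached_masks(ctx);   /* rare, so draws stay cheap */
   }
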
+ */ +void si_update_compressed_colortex_masks(struct si_context *sctx) +{ + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]); + } +} + /* BUFFER DISCARD/INVALIDATION */ /* Reallocate a buffer and update all resource bindings where the buffer is diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 240d96190a9..0efca193951 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -249,7 +249,7 @@ void si_dma_copy(struct pipe_context *ctx, (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) || rdst->cmask.size || rdst->fmask.size || rsrc->cmask.size || rsrc->fmask.size || - rdst->dcc_buffer || rsrc->dcc_buffer) { + rdst->dcc_offset || rsrc->dcc_offset) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 37fd4a25d59..8b50a49cba0 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -68,6 +68,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); if (sctx->custom_blend_fastclear) sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); + if (sctx->custom_blend_dcc_decompress) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_dcc_decompress); util_unreference_framebuffer_state(&sctx->framebuffer.state); if (sctx->blitter) @@ -418,7 +420,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_VENDOR_ID: - return 0x1002; + return ATI_VENDOR_ID; case PIPE_CAP_DEVICE_ID: return sscreen->b.info.pci_id; case PIPE_CAP_ACCELERATED: @@ -427,6 +429,14 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return sscreen->b.info.vram_size >> 20; case PIPE_CAP_UMA: return 0; + case PIPE_CAP_PCI_GROUP: + return sscreen->b.info.pci_domain; + case PIPE_CAP_PCI_BUS: + return sscreen->b.info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return sscreen->b.info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return sscreen->b.info.pci_func; } return 0; } @@ -611,6 +621,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.b.is_format_supported = si_is_format_supported; sscreen->b.b.resource_create = r600_resource_create_common; + si_init_screen_state_functions(sscreen); + if (!r600_common_screen_init(&sscreen->b, ws) || !si_init_gs_info(sscreen) || !si_init_shader_cache(sscreen)) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index ef860a58b83..0fef5f72098 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -120,8 +120,6 @@ struct si_blend_color { struct si_sampler_view { struct pipe_sampler_view base; struct list_head list; - struct r600_resource *resource; - struct r600_resource *dcc_buffer; /* [0..7] = image descriptor * [4..7] = buffer descriptor */ uint32_t state[8]; @@ -197,6 +195,7 @@ struct si_context { void *custom_blend_resolve; void *custom_blend_decompress; void *custom_blend_fastclear; + void *custom_blend_dcc_decompress; void *pstipple_sampler_state; struct si_screen *screen; struct pipe_fence_handle *last_gfx_fence; @@ -334,10 +333,7 @@ void cik_sdma_copy(struct pipe_context *ctx, /* si_blit.c */ void si_init_blit_functions(struct si_context *sctx); -void si_flush_depth_textures(struct si_context *sctx, - struct
si_textures_info *textures); -void si_decompress_color_textures(struct si_context *sctx, - struct si_textures_info *textures); +void si_decompress_textures(struct si_context *sctx); void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index b23b17ad77b..f823af188c7 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -34,6 +34,7 @@ #include "util/u_format_s3tc.h" #include "util/u_memory.h" #include "util/u_pstipple.h" +#include "util/u_resource.h" /* Initialize an external atom (owned by ../radeon). */ static void @@ -2250,11 +2251,7 @@ static void si_initialize_color_surface(struct si_context *sctx, } assert(format != V_028C70_COLOR_INVALID); swap = r600_translate_colorswap(surf->base.format); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = V_028C70_ENDIAN_NONE; - } else { - endian = si_colorformat_endian_swap(format); - } + endian = si_colorformat_endian_swap(format); /* blend clamp should be set for all NORM/SRGB types */ if (ntype == V_028C70_NUMBER_UNORM || @@ -2322,9 +2319,8 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_color_info = color_info; surf->cb_color_attrib = color_attrib; - if (sctx->b.chip_class >= VI && rtex->dcc_buffer) { + if (sctx->b.chip_class >= VI && rtex->dcc_offset) { unsigned max_uncompressed_block_size = 2; - uint64_t dcc_offset = rtex->surface.level[level].dcc_offset; if (rtex->surface.nsamples > 1) { if (rtex->surface.bpe == 1) @@ -2335,7 +2331,9 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | S_028C78_INDEPENDENT_64B_BLOCKS(1); - surf->cb_dcc_base = (rtex->dcc_buffer->gpu_address + dcc_offset) >> 8; + surf->cb_dcc_base = (rtex->resource.gpu_address + + rtex->dcc_offset + + rtex->surface.level[level].dcc_offset) >> 8; } if (rtex->fmask.size) { @@ -2674,12 +2672,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom RADEON_PRIO_CMASK); } - if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - tex->dcc_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_DCC); - } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, sctx->b.chip_class >= VI ? 14 : 13); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ @@ -2802,105 +2794,73 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) */ /** - * Create a sampler view. - * - * @param ctx context - * @param texture texture - * @param state sampler view template - * @param width0 width0 override (for compressed textures as int) - * @param height0 height0 override (for compressed textures as int) - * @param force_level set the base address to the level (for compressed textures) + * Build the sampler view descriptor for a buffer texture. 
+ * @param state 256-bit descriptor; only the high 128 bits are filled in */ -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) +static void +si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf, + enum pipe_format format, + unsigned first_element, unsigned last_element, + uint32_t *state) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); - struct r600_texture *tmp = (struct r600_texture*)texture; const struct util_format_description *desc; - unsigned format, num_format, base_level, first_level, last_level; - uint32_t pitch = 0; - unsigned char state_swizzle[4], swizzle[4]; - unsigned height, depth, width; - enum pipe_format pipe_format = state->format; - struct radeon_surf_level *surflevel; int first_non_void; uint64_t va; - unsigned last_layer = state->u.tex.last_layer; + unsigned stride; + unsigned num_records; + unsigned num_format, data_format; - if (!view) - return NULL; - - /* initialize base object */ - view->base = *state; - view->base.texture = NULL; - view->base.reference.count = 1; - view->base.context = ctx; - - /* NULL resource, obey swizzle (only ZERO and ONE make sense). */ - if (!texture) { - view->state[3] = S_008F1C_DST_SEL_X(si_map_swizzle(state->swizzle_r)) | - S_008F1C_DST_SEL_Y(si_map_swizzle(state->swizzle_g)) | - S_008F1C_DST_SEL_Z(si_map_swizzle(state->swizzle_b)) | - S_008F1C_DST_SEL_W(si_map_swizzle(state->swizzle_a)) | - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D); - return &view->base; - } - - pipe_resource_reference(&view->base.texture, texture); - view->resource = &tmp->resource; - - if (state->format == PIPE_FORMAT_X24S8_UINT || - state->format == PIPE_FORMAT_S8X24_UINT || - state->format == PIPE_FORMAT_X32_S8X24_UINT || - state->format == PIPE_FORMAT_S8_UINT) - view->is_stencil_sampler = true; - - /* Buffer resource. 
*/ - if (texture->target == PIPE_BUFFER) { - unsigned stride, num_records; - - desc = util_format_description(state->format); - first_non_void = util_format_get_first_non_void_channel(state->format); - stride = desc->block.bits / 8; - va = tmp->resource.gpu_address + state->u.buf.first_element*stride; - format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + desc = util_format_description(format); + first_non_void = util_format_get_first_non_void_channel(format); + stride = desc->block.bits / 8; + va = buf->gpu_address + first_element * stride; + num_format = si_translate_buffer_numformat(&screen->b.b, desc, first_non_void); + data_format = si_translate_buffer_dataformat(&screen->b.b, desc, first_non_void); - num_records = state->u.buf.last_element + 1 - state->u.buf.first_element; - num_records = MIN2(num_records, texture->width0 / stride); + num_records = last_element + 1 - first_element; + num_records = MIN2(num_records, buf->b.b.width0 / stride); - if (sctx->b.chip_class >= VI) - num_records *= stride; - - view->state[4] = va; - view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(stride); - view->state[6] = num_records; - view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) | - S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(format); + if (screen->b.chip_class >= VI) + num_records *= stride; - LIST_ADDTAIL(&view->list, &sctx->b.texture_buffers); - return &view->base; - } - - state_swizzle[0] = state->swizzle_r; - state_swizzle[1] = state->swizzle_g; - state_swizzle[2] = state->swizzle_b; - state_swizzle[3] = state->swizzle_a; + state[4] = va; + state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(stride); + state[6] = num_records; + state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) | + S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); +} - surflevel = tmp->surface.level; +/** + * Build the sampler view descriptor for a texture. + */ +static void +si_make_texture_descriptor(struct si_screen *screen, + struct r600_texture *tex, + enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], + unsigned base_level, unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state) +{ + struct pipe_resource *res = &tex->resource.b.b; + const struct radeon_surf_level *surflevel = tex->surface.level; + const struct util_format_description *desc; + unsigned char swizzle[4]; + int first_non_void; + unsigned num_format, data_format; + uint32_t pitch; + uint64_t va; /* Texturing with separate depth and stencil. 
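
One subtlety in si_make_buffer_descriptor above: NUM_RECORDS counts elements on SI/CIK but bytes from VI onward, hence the trailing multiply by the stride. A worked example of the arithmetic, assuming a hypothetical 4096-byte buffer of 16-byte elements viewed from element 4 through 63:

   stride      = 16;                            /* desc->block.bits / 8   */
   va          = buf_va + 4 * 16;               /* first_element * stride */
   num_records = 63 + 1 - 4;                    /* 60 elements            */
   num_records = MIN2(num_records, 4096 / 16);  /* clamp to buffer: 60    */
   if (chip_class >= VI)
      num_records *= 16;                        /* VI counts bytes: 960   */
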
*/ - if (tmp->is_depth && !tmp->is_flushing_texture) { + if (tex->is_depth && !tex->is_flushing_texture) { switch (pipe_format) { case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: pipe_format = PIPE_FORMAT_Z32_FLOAT; @@ -2914,7 +2874,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx, case PIPE_FORMAT_S8X24_UINT: case PIPE_FORMAT_X32_S8X24_UINT: pipe_format = PIPE_FORMAT_S8_UINT; - surflevel = tmp->surface.stencil_level; + surflevel = tex->surface.stencil_level; break; default:; } @@ -3008,89 +2968,63 @@ si_create_sampler_view_custom(struct pipe_context *ctx, } } - format = si_translate_texformat(ctx->screen, pipe_format, desc, first_non_void); - if (format == ~0) { - format = 0; - } - - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (force_level) { - assert(force_level == first_level && - force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); + data_format = si_translate_texformat(&screen->b.b, pipe_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; } - pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); - - if (texture->target == PIPE_TEXTURE_1D_ARRAY) { + if (res->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; - depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) { - depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) - depth = texture->array_size / 6; + depth = res->array_size; + } else if (res->target == PIPE_TEXTURE_2D_ARRAY) { + depth = res->array_size; + } else if (res->target == PIPE_TEXTURE_CUBE_ARRAY) + depth = res->array_size / 6; - /* This is not needed if state trackers set last_layer correctly. */ - if (state->target == PIPE_TEXTURE_1D || - state->target == PIPE_TEXTURE_2D || - state->target == PIPE_TEXTURE_RECT || - state->target == PIPE_TEXTURE_CUBE) - last_layer = state->u.tex.first_layer; - - va = tmp->resource.gpu_address + surflevel[base_level].offset; - - view->state[0] = va >> 8; - view->state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(format) | - S_008F14_NUM_FORMAT(num_format)); - view->state[2] = (S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1)); - view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ? - 0 : first_level) | - S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ? 
- util_logbase2(texture->nr_samples) : - last_level) | - S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) | - S_008F1C_POW2_PAD(texture->last_level > 0) | - S_008F1C_TYPE(si_tex_dim(texture->target, state->target, - texture->nr_samples))); - view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); - view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) | - S_008F24_LAST_ARRAY(last_layer)); - - if (tmp->dcc_buffer) { - uint64_t dcc_offset = surflevel[base_level].dcc_offset; + pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); + va = tex->resource.gpu_address + surflevel[base_level].offset; + + state[0] = va >> 8; + state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1)); + state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(res->nr_samples > 1 ? + 0 : first_level) | + S_008F1C_LAST_LEVEL(res->nr_samples > 1 ? + util_logbase2(res->nr_samples) : + last_level) | + S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) | + S_008F1C_POW2_PAD(res->last_level > 0) | + S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples))); + state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); + state[5] = (S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer)); + + if (tex->dcc_offset) { unsigned swap = r600_translate_colorswap(pipe_format); - view->state[6] = S_008F28_COMPRESSION_EN(1) | S_008F28_ALPHA_IS_ON_MSB(swap <= 1); - view->state[7] = (tmp->dcc_buffer->gpu_address + dcc_offset) >> 8; - view->dcc_buffer = tmp->dcc_buffer; + state[6] = S_008F28_COMPRESSION_EN(1) | S_008F28_ALPHA_IS_ON_MSB(swap <= 1); + state[7] = (tex->resource.gpu_address + + tex->dcc_offset + + surflevel[base_level].dcc_offset) >> 8; } else { - view->state[6] = 0; - view->state[7] = 0; + state[6] = 0; + state[7] = 0; } /* Initialize the sampler view for FMASK. 
*/ - if (tmp->fmask.size) { - uint64_t va = tmp->resource.gpu_address + tmp->fmask.offset; + if (tex->fmask.size) { uint32_t fmask_format; - switch (texture->nr_samples) { + va = tex->resource.gpu_address + tex->fmask.offset; + + switch (res->nr_samples) { case 2: fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; break; @@ -3105,27 +3039,129 @@ si_create_sampler_view_custom(struct pipe_context *ctx, fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID; } - view->fmask_state[0] = va >> 8; - view->fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(fmask_format) | - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT); - view->fmask_state[2] = S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1); - view->fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_008F1C_TILING_INDEX(tmp->fmask.tile_mode_index) | - S_008F1C_TYPE(si_tex_dim(texture->target, - state->target, 0)); - view->fmask_state[4] = S_008F20_DEPTH(depth - 1) | - S_008F20_PITCH(tmp->fmask.pitch_in_pixels - 1); - view->fmask_state[5] = S_008F24_BASE_ARRAY(state->u.tex.first_layer) | - S_008F24_LAST_ARRAY(last_layer); - view->fmask_state[6] = 0; - view->fmask_state[7] = 0; + fmask_state[0] = va >> 8; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(fmask_format) | + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT); + fmask_state[2] = S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1); + fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TILING_INDEX(tex->fmask.tile_mode_index) | + S_008F1C_TYPE(si_tex_dim(res->target, target, 0)); + fmask_state[4] = S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(tex->fmask.pitch_in_pixels - 1); + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + } +} + +/** + * Create a sampler view. + * + * @param ctx context + * @param texture texture + * @param state sampler view template + * @param width0 width0 override (for compressed textures as int) + * @param height0 height0 override (for compressed textures as int) + * @param force_level set the base address to the level (for compressed textures) + */ +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) +{ + struct si_context *sctx = (struct si_context*)ctx; + struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct r600_texture *tmp = (struct r600_texture*)texture; + unsigned base_level, first_level, last_level; + unsigned char state_swizzle[4]; + unsigned height, depth, width; + unsigned last_layer = state->u.tex.last_layer; + + if (!view) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + view->base.reference.count = 1; + view->base.context = ctx; + + /* NULL resource, obey swizzle (only ZERO and ONE make sense). 
*/ + if (!texture) { + view->state[3] = S_008F1C_DST_SEL_X(si_map_swizzle(state->swizzle_r)) | + S_008F1C_DST_SEL_Y(si_map_swizzle(state->swizzle_g)) | + S_008F1C_DST_SEL_Z(si_map_swizzle(state->swizzle_b)) | + S_008F1C_DST_SEL_W(si_map_swizzle(state->swizzle_a)) | + S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D); + return &view->base; } + pipe_resource_reference(&view->base.texture, texture); + + if (state->format == PIPE_FORMAT_X24S8_UINT || + state->format == PIPE_FORMAT_S8X24_UINT || + state->format == PIPE_FORMAT_X32_S8X24_UINT || + state->format == PIPE_FORMAT_S8_UINT) + view->is_stencil_sampler = true; + + /* Buffer resource. */ + if (texture->target == PIPE_BUFFER) { + si_make_buffer_descriptor(sctx->screen, + (struct r600_resource *)texture, + state->format, + state->u.buf.first_element, + state->u.buf.last_element, + view->state); + + LIST_ADDTAIL(&view->list, &sctx->b.texture_buffers); + return &view->base; + } + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; + + if (force_level) { + assert(force_level == first_level && + force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + /* This is not needed if state trackers set last_layer correctly. */ + if (state->target == PIPE_TEXTURE_1D || + state->target == PIPE_TEXTURE_2D || + state->target == PIPE_TEXTURE_RECT || + state->target == PIPE_TEXTURE_CUBE) + last_layer = state->u.tex.first_layer; + + si_make_texture_descriptor(sctx->screen, tmp, state->target, + state->format, state_swizzle, + base_level, first_level, last_level, + state->u.tex.first_layer, last_layer, + width, height, depth, + view->state, view->fmask_state); + return &view->base; } @@ -3144,7 +3180,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, { struct si_sampler_view *view = (struct si_sampler_view *)state; - if (view->resource && view->resource->b.b.target == PIPE_BUFFER) + if (state->texture && state->texture->target == PIPE_BUFFER) LIST_DELINIT(&view->list); pipe_resource_reference(&state->texture, NULL); @@ -3522,6 +3558,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); sctx->custom_blend_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); sctx->custom_blend_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); sctx->b.b.set_clip_state = si_set_clip_state; sctx->b.b.set_scissor_states = si_set_scissor_states; @@ -3564,6 +3601,68 @@ void si_init_state_functions(struct si_context *sctx) si_init_config(sctx); } +static void si_query_opaque_metadata(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md) +{ + struct si_screen *sscreen = (struct si_screen*)rscreen; + struct pipe_resource *res = &rtex->resource.b.b; + static const unsigned char swizzle[] = { + PIPE_SWIZZLE_RED, + PIPE_SWIZZLE_GREEN, + PIPE_SWIZZLE_BLUE, + PIPE_SWIZZLE_ALPHA + }; + uint32_t desc[8], i; + bool is_array = util_resource_is_array_texture(res); + + /* DRM 2.x.x doesn't support 
this. */ if (rscreen->info.drm_major != 3) return; + + assert(rtex->fmask.size == 0); + + /* Metadata image format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + + md->metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id; + + si_make_texture_descriptor(sscreen, rtex, res->target, res->format, + swizzle, 0, 0, res->last_level, 0, + is_array ? res->array_size - 1 : 0, + res->width0, res->height0, res->depth0, + desc, NULL); + + /* Clear the base address and set the relative DCC offset. */ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + desc[7] = rtex->dcc_offset >> 8; + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md->metadata[2], desc, sizeof(desc)); + + /* Dwords [10:..] contain the mipmap level offsets. */ + for (i = 0; i <= res->last_level; i++) + md->metadata[10+i] = rtex->surface.level[i].offset >> 8; + + md->size_metadata = (11 + res->last_level) * 4; +} + +void si_init_screen_state_functions(struct si_screen *sscreen) +{ + sscreen->b.query_opaque_metadata = si_query_opaque_metadata; +} + static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 40792cbc1d5..60c34f19e55 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -249,6 +249,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx); void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, const uint8_t *ptr, unsigned size, uint32_t *const_offset); void si_shader_change_notify(struct si_context *sctx); +void si_update_compressed_colortex_masks(struct si_context *sctx); void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom); /* si_state.c */ @@ -263,6 +264,7 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage); void si_init_state_functions(struct si_context *sctx); +void si_init_screen_state_functions(struct si_screen *sscreen); unsigned cik_bank_wh(unsigned bankwh); unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode); unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 91ccd073267..84b850a2992 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -33,21 +33,6 @@ #include "util/u_upload_mgr.h" #include "util/u_prim.h" -static void si_decompress_textures(struct si_context *sctx) -{ - if (!sctx->blitter->running) { - /* Flush depth textures which need to be flushed.
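
Reading the layout that si_query_opaque_metadata documents above is symmetric: every offset was stored shifted right by 8, so an importer shifts left by 8. A sketch of the consumer side, under the assumption that the blob really is format version 1 from a matching device (the level_offset array and last_level variable are illustrative):

   if (md->size_metadata >= 11 * 4 &&
       md->metadata[0] == 1 &&
       (md->metadata[1] >> 16) == ATI_VENDOR_ID) {
      /* metadata[2..9] hold the image descriptor; the base address is 0,
       * and dword [9] (= desc[7]) is the relative DCC offset.
       */
      uint64_t dcc_offset = (uint64_t)md->metadata[9] << 8;

      /* metadata[10..10+last_level] hold the per-level offsets. */
      for (unsigned i = 0; i <= last_level; i++)
         level_offset[i] = (uint64_t)md->metadata[10 + i] << 8;
   }
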
*/ - for (int i = 0; i < SI_NUM_SHADERS; i++) { - if (sctx->samplers[i].depth_texture_mask) { - si_flush_depth_textures(sctx, &sctx->samplers[i]); - } - if (sctx->samplers[i].compressed_colortex_mask) { - si_decompress_color_textures(sctx, &sctx->samplers[i]); - } - } - } -} - static unsigned si_conv_pipe_prim(unsigned mode) { static const unsigned prim_conv[] = { @@ -763,7 +748,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct pipe_index_buffer ib = {}; - unsigned mask; + unsigned mask, dirty_fb_counter; if (!info->count && !info->indirect && (info->indexed || !info->count_from_stream_output)) @@ -782,6 +767,16 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) return; } + /* Re-emit the framebuffer state if needed. */ + dirty_fb_counter = p_atomic_read(&sctx->b.screen->dirty_fb_counter); + if (dirty_fb_counter != sctx->b.last_dirty_fb_counter) { + sctx->b.last_dirty_fb_counter = dirty_fb_counter; + sctx->framebuffer.dirty_cbufs |= + ((1 << sctx->framebuffer.state.nr_cbufs) - 1); + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); + } + si_decompress_textures(sctx); /* Set the rasterization primitive type. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 321b87d80a6..5fe1f7960f3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1154,6 +1154,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; } + if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) + sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1); + /* Compile the main shader part for use with a prolog and/or epilog. 
*/ if (sel->type != PIPE_SHADER_GEOMETRY && !sscreen->use_monolithic_shaders) { diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c index ac764029a2f..c2950e4a703 100644 --- a/src/gallium/drivers/rbug/rbug_screen.c +++ b/src/gallium/drivers/rbug/rbug_screen.c @@ -160,13 +160,14 @@ rbug_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * rbug_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct rbug_screen *rb_screen = rbug_screen(_screen); struct pipe_screen *screen = rb_screen->screen; struct pipe_resource *result; - result = screen->resource_from_handle(screen, templ, handle); + result = screen->resource_from_handle(screen, templ, handle, usage); result = rbug_resource_create(rbug_screen(_screen), result); @@ -176,14 +177,15 @@ rbug_screen_resource_from_handle(struct pipe_screen *_screen, static boolean rbug_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *_resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct rbug_screen *rb_screen = rbug_screen(_screen); struct rbug_resource *rb_resource = rbug_resource(_resource); struct pipe_screen *screen = rb_screen->screen; struct pipe_resource *resource = rb_resource->resource; - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 097ffe6f920..bfd3598fc57 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -266,6 +266,10 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index e3e28a3ef32..5703ca2dedb 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -1047,7 +1047,7 @@ img_filter_2d_linear_repeat_POT(const struct sp_sampler_view *sp_sview, } /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) { + for (c = 0; c < TGSI_NUM_CHANNELS; c++) { rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1063,7 +1063,7 @@ static inline void img_filter_2d_nearest_repeat_POT(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const unsigned xpot = pot_level_size(sp_sview->xpot, args->level); const unsigned ypot = pot_level_size(sp_sview->ypot, args->level); @@ -1085,7 +1085,7 @@ img_filter_2d_nearest_repeat_POT(const struct sp_sampler_view *sp_sview, addr.bits.z = sp_sview->base.u.tex.first_layer; out = get_texel_2d_no_border(sp_sview, addr, x0, y0); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1098,7 +1098,7 @@ static inline void img_filter_2d_nearest_clamp_POT(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct 
img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const unsigned xpot = pot_level_size(sp_sview->xpot, args->level); const unsigned ypot = pot_level_size(sp_sview->ypot, args->level); @@ -1128,7 +1128,7 @@ img_filter_2d_nearest_clamp_POT(const struct sp_sampler_view *sp_sview, y0 = ypot - 1; out = get_texel_2d_no_border(sp_sview, addr, x0, y0); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1141,7 +1141,7 @@ static void img_filter_1d_nearest(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; const int width = u_minify(texture->width0, args->level); @@ -1159,7 +1159,7 @@ img_filter_1d_nearest(const struct sp_sampler_view *sp_sview, out = get_texel_1d_array(sp_sview, sp_samp, addr, x, sp_sview->base.u.tex.first_layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1191,7 +1191,7 @@ img_filter_1d_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); out = get_texel_1d_array(sp_sview, sp_samp, addr, x, layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1225,7 +1225,7 @@ img_filter_2d_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_2d(sp_sview, sp_samp, addr, x, y); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1260,7 +1260,7 @@ img_filter_2d_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_2d_array(sp_sview, sp_samp, addr, x, y, layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1304,7 +1304,7 @@ img_filter_cube_nearest(const struct sp_sampler_view *sp_sview, } out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1340,7 +1340,7 @@ img_filter_cube_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1375,7 +1375,7 @@ img_filter_3d_nearest(const struct sp_sampler_view *sp_sview, addr.bits.level = args->level; out = get_texel_3d(sp_sview, sp_samp, addr, x, y, z); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; } @@ -1407,7 +1407,7 @@ img_filter_1d_linear(const struct sp_sampler_view *sp_sview, sp_sview->base.u.tex.first_layer); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]); } @@ -1439,7 +1439,7 @@ img_filter_1d_array_linear(const struct sp_sampler_view *sp_sview, tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1, 
layer); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]); } @@ -1541,13 +1541,13 @@ img_filter_2d_linear(const struct sp_sampler_view *sp_sview, tx[3] = get_texel_2d(sp_sview, sp_samp, addr, x1, y1); if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1587,13 +1587,13 @@ img_filter_2d_array_linear(const struct sp_sampler_view *sp_sview, tx[3] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer); if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1652,13 +1652,13 @@ img_filter_cube_linear(const struct sp_sampler_view *sp_sview, } if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1720,13 +1720,13 @@ img_filter_cube_array_linear(const struct sp_sampler_view *sp_sview, } if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1771,7 +1771,7 @@ img_filter_3d_linear(const struct sp_sampler_view *sp_sview, tx13 = get_texel_3d(sp_sview, sp_samp, addr, x1, y1, z1); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_3d(xw, yw, zw, tx00[c], tx01[c], tx02[c], tx03[c], @@ -2209,6 +2209,7 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview, const float t[TGSI_QUAD_SIZE], const float p[TGSI_QUAD_SIZE], const uint faces[TGSI_QUAD_SIZE], + const int8_t *offset, unsigned level, const float dudx, const float dvdx, const float dudy, const float dvdy, @@ -2268,6 +2269,8 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview, /* F *= formScale; */ /* no need to scale F as we don't use it below here */ args.level = level; + args.offset = offset; + for (j = 0; j < TGSI_QUAD_SIZE; j++) { /* Heckbert MS thesis, p. 
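
All of the linear paths above funnel into lerp_2d, which is plain bilinear blending: two lerps along x, then one lerp along y between the results. A self-contained equivalent of what those calls compute (argument naming may differ slightly from softpipe's helper):

   static float
   lerp(float x, float v0, float v1)
   {
      return v0 + x * (v1 - v0);
   }

   /* v00 and v10 differ by one step in x; v01 and v11 are one step down in y. */
   static float
   lerp_2d(float xw, float yw,
           float v00, float v10, float v01, float v11)
   {
      const float top = lerp(xw, v00, v10);
      const float bot = lerp(xw, v01, v11);
      return lerp(yw, top, bot);
   }
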
59; scan over the bounding box of the ellipse * and incrementally update the value of Ax^2+Bxy+Cy^2; when this @@ -2431,6 +2434,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, const float dvdy = (t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]) * t_to_v; struct img_filter_args args; + args.offset = filt_args->offset; + if (filt_args->control == TGSI_SAMPLER_LOD_BIAS || filt_args->control == TGSI_SAMPLER_LOD_NONE || /* XXX FIXME */ @@ -2495,6 +2500,11 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, args.p = p[j]; args.level = psview->u.tex.last_level; args.face_id = filt_args->faces[j]; + /* + * XXX: we overwrote any linear filter with nearest, so this + * isn't right (albeit if last level is 1x1 and no border it + * will work just the same). + */ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); } } @@ -2503,8 +2513,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, * seem to be worth the extra running time. */ img_filter_2d_ewa(sp_sview, sp_samp, min_filter, mag_filter, - s, t, p, filt_args->faces, level0, - dudx, dvdx, dudy, dvdy, rgba); + s, t, p, filt_args->faces, filt_args->offset, + level0, dudx, dvdx, dudy, dvdy, rgba); } if (DEBUG_TEX) { diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index 52df89504b8..52ec373f8f2 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -218,7 +218,8 @@ softpipe_resource_destroy(struct pipe_screen *pscreen, static struct pipe_resource * softpipe_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templat, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = softpipe_screen(screen)->winsys; struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource); @@ -251,7 +252,8 @@ softpipe_resource_from_handle(struct pipe_screen *screen, static boolean softpipe_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *pt, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = softpipe_screen(screen)->winsys; struct softpipe_resource *spr = softpipe_resource(pt); diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index b10eb45e548..da4281490ae 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_bitmask.h" #include "util/u_upload_mgr.h" +#include "os/os_time.h" #include "svga_context.h" #include "svga_screen.h" @@ -299,6 +300,7 @@ void svga_context_flush( struct svga_context *svga, { struct svga_screen *svgascreen = svga_screen(svga->pipe.screen); struct pipe_fence_handle *fence = NULL; + uint64_t t0; svga->curr.nr_fbs = 0; @@ -307,9 +309,14 @@ void svga_context_flush( struct svga_context *svga, */ svga_context_flush_buffers(svga); + svga->hud.command_buffer_size += + svga->swc->get_command_buffer_size(svga->swc); + /* Flush pending commands to hardware: */ + t0 = os_time_get(); svga->swc->flush(svga->swc, &fence); + svga->hud.flush_time += (os_time_get() - t0); svga->hud.num_flushes++; diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index f1a2041b6cf..1976f98e5c1 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -52,16 +52,19 @@ #define SVGA_QUERY_MAP_BUFFER_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 4) #define
SVGA_QUERY_NUM_RESOURCES_MAPPED (PIPE_QUERY_DRIVER_SPECIFIC + 5) #define SVGA_QUERY_NUM_BYTES_UPLOADED (PIPE_QUERY_DRIVER_SPECIFIC + 6) +#define SVGA_QUERY_COMMAND_BUFFER_SIZE (PIPE_QUERY_DRIVER_SPECIFIC + 7) +#define SVGA_QUERY_FLUSH_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 8) +#define SVGA_QUERY_SURFACE_WRITE_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 9) /* running total counters */ -#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 10) +#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 11) +#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 14) +#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 15) /*SVGA_QUERY_MAX has to be last because it is size of an array*/ -#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 16) /** * Maximum supported number of constant buffers per shader @@ -502,6 +505,9 @@ struct svga_context uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ + uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */ + uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */ + uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */ uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 255494a5de7..845f4ef3a1c 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -723,15 +723,18 @@ svga_create_query(struct pipe_context *pipe, case SVGA_QUERY_NUM_DRAW_CALLS: case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_NUM_BYTES_UPLOADED: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: case SVGA_QUERY_NUM_STATE_OBJECTS: - case SVGA_QUERY_NUM_VALIDATIONS: - case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_SURFACE_VIEWS: - case SVGA_QUERY_NUM_RESOURCES_MAPPED: - case SVGA_QUERY_NUM_BYTES_UPLOADED: case SVGA_QUERY_NUM_GENERATE_MIPMAP: break; default: @@ -792,15 +795,18 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_DRAW_CALLS: case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_NUM_BYTES_UPLOADED: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: case SVGA_QUERY_MEMORY_USED: 
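
The three new queries (command-buffer-size, flush-time, surface-write-flushes) follow the SVGA HUD convention for cumulative counters: the begin/end hooks in the hunks just below snapshot a running total, and the reported result is the growth over that interval rather than an absolute value. Schematically, using flush_time as the example:

   /* begin_query */  sq->begin_count = svga->hud.flush_time;
   /* ... frame ... */
   /* end_query   */  sq->end_count   = svga->hud.flush_time;
   /* get_result  */  vresult->u64    = sq->end_count - sq->begin_count;
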
case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: case SVGA_QUERY_NUM_STATE_OBJECTS: - case SVGA_QUERY_NUM_VALIDATIONS: - case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_SURFACE_VIEWS: - case SVGA_QUERY_NUM_RESOURCES_MAPPED: - case SVGA_QUERY_NUM_BYTES_UPLOADED: case SVGA_QUERY_NUM_GENERATE_MIPMAP: /* nothing */ break; @@ -884,6 +890,15 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_BYTES_UPLOADED: sq->begin_count = svga->hud.num_bytes_uploaded; break; + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + sq->begin_count = svga->hud.command_buffer_size; + break; + case SVGA_QUERY_FLUSH_TIME: + sq->begin_count = svga->hud.flush_time; + break; + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: + sq->begin_count = svga->hud.surface_write_flushes; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -978,6 +993,15 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_BYTES_UPLOADED: sq->end_count = svga->hud.num_bytes_uploaded; break; + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + sq->end_count = svga->hud.command_buffer_size; + break; + case SVGA_QUERY_FLUSH_TIME: + sq->end_count = svga->hud.flush_time; + break; + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: + sq->end_count = svga->hud.surface_write_flushes; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -1073,9 +1097,12 @@ svga_get_query_result(struct pipe_context *pipe, case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_RESOURCES_MAPPED: case SVGA_QUERY_NUM_BYTES_UPLOADED: - case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: vresult->u64 = sq->end_count - sq->begin_count; break; /* These are running total counters */ diff --git a/src/gallium/drivers/svga/svga_resource.c b/src/gallium/drivers/svga/svga_resource.c index 1c3bcd67afa..264ac335405 100644 --- a/src/gallium/drivers/svga/svga_resource.c +++ b/src/gallium/drivers/svga/svga_resource.c @@ -47,7 +47,8 @@ svga_resource_create(struct pipe_screen *screen, static struct pipe_resource * svga_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (template->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index 3f754c4d53e..1edb41dabee 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -463,8 +463,10 @@ svga_texture_transfer_map(struct pipe_context *pipe, assert(transfer->usage & PIPE_TRANSFER_WRITE); if ((transfer->usage & PIPE_TRANSFER_UNSYNCHRONIZED) == 0) { svga_surfaces_flush(svga); - if (!sws->surface_is_flushed(sws, surf)) + if (!sws->surface_is_flushed(sws, surf)) { + svga->hud.surface_write_flushes++; svga_context_flush(svga, NULL); + } } } } @@ -1038,7 +1040,12 @@ svga_texture_generate_mipmap(struct pipe_context *pipe, return FALSE; sv = svga_pipe_sampler_view(psv); - svga_validate_pipe_sampler_view(svga, sv); + ret = svga_validate_pipe_sampler_view(svga, sv); + if (ret != PIPE_OK) { + svga_context_flush(svga, NULL); + ret = svga_validate_pipe_sampler_view(svga, sv); + assert(ret == PIPE_OK); + } ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle); 
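
The generate_mipmap fix above applies the driver's standard command-buffer idiom: when an emit returns anything other than PIPE_OK the buffer is presumed full, so flush once and retry, asserting that the retry fits in an empty buffer. The shape of the pattern, with an illustrative emit function name:

   ret = emit_something(svga->swc /* , args */);
   if (ret != PIPE_OK) {
      svga_context_flush(svga, NULL);   /* drain the command buffer */
      ret = emit_something(svga->swc /* , args */);
      assert(ret == PIPE_OK);           /* must succeed when empty */
   }
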
if (ret != PIPE_OK) { diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index c9abd49ec1e..bcc512041f7 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -360,6 +360,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_STRING_MARKER: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; @@ -827,6 +831,12 @@ svga_get_driver_query_info(struct pipe_screen *screen, PIPE_DRIVER_QUERY_TYPE_UINT64), QUERY("num-bytes-uploaded", SVGA_QUERY_NUM_BYTES_UPLOADED, PIPE_DRIVER_QUERY_TYPE_BYTES), + QUERY("command-buffer-size", SVGA_QUERY_COMMAND_BUFFER_SIZE, + PIPE_DRIVER_QUERY_TYPE_BYTES), + QUERY("flush-time", SVGA_QUERY_FLUSH_TIME, + PIPE_DRIVER_QUERY_TYPE_MICROSECONDS), + QUERY("surface-write-flushes", SVGA_QUERY_SURFACE_WRITE_FLUSHES, + PIPE_DRIVER_QUERY_TYPE_UINT64), /* running total counters */ QUERY("memory-used", SVGA_QUERY_MEMORY_USED, diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c index 5b441295715..321c564e7f5 100644 --- a/src/gallium/drivers/svga/svga_screen_cache.c +++ b/src/gallium/drivers/svga/svga_screen_cache.c @@ -563,8 +563,14 @@ svga_screen_cache_dump(const struct svga_screen *svgascreen) struct svga_host_surface_cache_entry *entry = LIST_ENTRY(struct svga_host_surface_cache_entry, curr, bucket_head); - if (entry->key.format != 37) { - debug_printf(" %u x %u x %u format %u\n", + if (entry->key.format == SVGA3D_BUFFER) { + debug_printf(" %p: buffer %u bytes\n", + entry->handle, + entry->key.size.width); + } + else { + debug_printf(" %p: %u x %u x %u format %u\n", + entry->handle, entry->key.size.width, entry->key.size.height, entry->key.size.depth, diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c index e7b540cc707..6179a797fa2 100644 --- a/src/gallium/drivers/svga/svga_state_sampler.c +++ b/src/gallium/drivers/svga/svga_state_sampler.c @@ -103,8 +103,17 @@ svga_validate_pipe_sampler_view(struct svga_context *svga, SVGA3dSurfaceFormat format; SVGA3dResourceType resourceDim; SVGA3dShaderResourceViewDesc viewDesc; + enum pipe_format pformat = sv->base.format; - format = svga_translate_format(ss, sv->base.format, + /* vgpu10 cannot create a BGRX view for a BGRA resource, so force it to + * create a BGRA view. + */ + if (pformat == PIPE_FORMAT_B8G8R8X8_UNORM && + sv->base.texture->format == PIPE_FORMAT_B8G8R8A8_UNORM) { + pformat = PIPE_FORMAT_B8G8R8A8_UNORM; + } + + format = svga_translate_format(ss, pformat, PIPE_BIND_SAMPLER_VIEW); assert(format != SVGA3D_FORMAT_INVALID); diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index 562c6690fc1..0ad6b5e6c76 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -108,6 +108,12 @@ struct svga_winsys_context uint32_t nr_bytes, uint32_t nr_relocs ); /** + * Returns current size of command buffer, in bytes. + */ + unsigned + (*get_command_buffer_size)(struct svga_winsys_context *swc); + + /** * Emit a relocation for a host surface. 
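* A relocation records that the command being built references the given
* surface, so the winsys can track that reference and synchronize access
* to the surface when the command buffer is flushed.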
* * @param flags bitmask of SVGA_RELOC_* flags diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format new file mode 100644 index 00000000000..0ec65a5de88 --- /dev/null +++ b/src/gallium/drivers/swr/.clang-format @@ -0,0 +1,64 @@ +--- +Language: Cpp +AccessModifierOffset: -3 +AlignAfterOpenBracket: true +AlignEscapedNewlinesLeft: false +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AlwaysBreakAfterDefinitionReturnType: true +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: false +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: true +BinPackParameters: false +BinPackArguments: false +ColumnLimit: 78 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 3 +DerivePointerAlignment: false +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: false +IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +KeepEmptyLinesAtTheStartOfBlocks: true +NamespaceIndentation: Inner +ObjCBlockIndentWidth: 3 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 0 +PointerAlignment: Right +SpacesBeforeTrailingComments: 1 +Cpp11BracedListStyle: true +Standard: Cpp11 +IndentWidth: 3 +TabWidth: 8 +UseTab: Never +BreakBeforeBraces: Linux +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpaceAfterCStyleCast: false +SpacesInContainerLiterals: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 3 +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +SpaceBeforeParens: ControlStatements +DisableFormat: false +... + diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am new file mode 100644 index 00000000000..f08806aaf77 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.am @@ -0,0 +1,31 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include Makefile.sources +include $(top_srcdir)/src/gallium/Automake.inc + +AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) + +noinst_LTLIBRARIES = libmesaswr.la + +libmesaswr_la_SOURCES = $(LOADER_SOURCES) + +EXTRA_DIST = Makefile.sources-arch diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources new file mode 100644 index 00000000000..72247211184 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.sources @@ -0,0 +1,23 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +LOADER_SOURCES := \ + swr_loader.cpp diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch new file mode 100644 index 00000000000..6c105f46199 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.sources-arch @@ -0,0 +1,111 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
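+#
+# These source lists are shared by the per-architecture builds: the avx/
+# and avx2/ Makefile.am files below each include this file (as
+# ../Makefile.sources-arch) and compile the same sources with different
+# -march and KNOB_ARCH settings.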
+ +CXX_SOURCES := \ + swr_clear.cpp \ + swr_context.cpp \ + swr_context.h \ + swr_context_llvm.h \ + swr_draw.cpp \ + swr_public.h \ + swr_resource.h \ + swr_screen.cpp \ + swr_screen.h \ + swr_state.cpp \ + swr_state.h \ + swr_tex_sample.cpp \ + swr_tex_sample.h \ + swr_scratch.h \ + swr_scratch.cpp \ + swr_shader.cpp \ + swr_memory.h \ + swr_fence.h \ + swr_fence.cpp \ + swr_query.h \ + swr_query.cpp + +COMMON_CXX_SOURCES := \ + rasterizer/common/containers.hpp \ + rasterizer/common/formats.cpp \ + rasterizer/common/formats.h \ + rasterizer/common/isa.hpp \ + rasterizer/common/os.h \ + rasterizer/common/rdtsc_buckets.cpp \ + rasterizer/common/rdtsc_buckets.h \ + rasterizer/common/rdtsc_buckets_shared.h \ + rasterizer/common/simdintrin.h \ + rasterizer/common/swr_assert.cpp \ + rasterizer/common/swr_assert.h + +CORE_CXX_SOURCES := \ + rasterizer/core/api.cpp \ + rasterizer/core/api.h \ + rasterizer/core/arena.cpp \ + rasterizer/core/arena.h \ + rasterizer/core/backend.cpp \ + rasterizer/core/backend.h \ + rasterizer/core/blend.h \ + rasterizer/core/clip.cpp \ + rasterizer/core/clip.h \ + rasterizer/core/context.h \ + rasterizer/core/depthstencil.h \ + rasterizer/core/fifo.hpp \ + rasterizer/core/format_traits.h \ + rasterizer/core/format_types.h \ + rasterizer/core/frontend.cpp \ + rasterizer/core/frontend.h \ + rasterizer/core/knobs.h \ + rasterizer/core/knobs_init.h \ + rasterizer/core/multisample.cpp \ + rasterizer/core/multisample.h \ + rasterizer/core/pa_avx.cpp \ + rasterizer/core/pa.h \ + rasterizer/core/rasterizer.cpp \ + rasterizer/core/rasterizer.h \ + rasterizer/core/rdtsc_core.cpp \ + rasterizer/core/rdtsc_core.h \ + rasterizer/core/state.h \ + rasterizer/core/threads.cpp \ + rasterizer/core/threads.h \ + rasterizer/core/tilemgr.cpp \ + rasterizer/core/tilemgr.h \ + rasterizer/core/utils.cpp \ + rasterizer/core/utils.h + +JITTER_CXX_SOURCES := \ + rasterizer/jitter/blend_jit.cpp \ + rasterizer/jitter/blend_jit.h \ + rasterizer/jitter/builder.cpp \ + rasterizer/jitter/builder.h \ + rasterizer/jitter/builder_misc.cpp \ + rasterizer/jitter/builder_misc.h \ + rasterizer/jitter/fetch_jit.cpp \ + rasterizer/jitter/fetch_jit.h \ + rasterizer/jitter/JitManager.cpp \ + rasterizer/jitter/JitManager.h \ + rasterizer/jitter/streamout_jit.cpp \ + rasterizer/jitter/streamout_jit.h + +MEMORY_CXX_SOURCES := \ + rasterizer/memory/ClearTile.cpp \ + rasterizer/memory/LoadTile.cpp \ + rasterizer/memory/StoreTile.cpp diff --git a/src/gallium/drivers/swr/avx/Makefile.am b/src/gallium/drivers/swr/avx/Makefile.am new file mode 100644 index 00000000000..384f1a7eecf --- /dev/null +++ b/src/gallium/drivers/swr/avx/Makefile.am @@ -0,0 +1,99 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include ../Makefile.sources-arch +include $(top_srcdir)/src/gallium/Automake.inc + +VPATH = $(srcdir) $(srcdir)/.. + +AM_CXXFLAGS = \ + -march=core-avx-i \ + -DKNOB_ARCH=KNOB_ARCH_AVX \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + -I$(builddir)/rasterizer/scripts \ + -I$(builddir)/rasterizer/jitter \ + -I$(srcdir)/../rasterizer \ + -I$(srcdir)/../rasterizer/core \ + -I$(srcdir)/../rasterizer/jitter + +lib_LTLIBRARIES = libswrAVX.la + +BUILT_SOURCES = \ + rasterizer/scripts/gen_knobs.cpp \ + rasterizer/scripts/gen_knobs.h \ + rasterizer/jitter/state_llvm.h \ + rasterizer/jitter/builder_gen.h \ + rasterizer/jitter/builder_gen.cpp \ + rasterizer/jitter/builder_x86.h \ + rasterizer/jitter/builder_x86.cpp + +libswrAVX_la_SOURCES = \ + $(CXX_SOURCES) \ + $(COMMON_CXX_SOURCES) \ + $(CORE_CXX_SOURCES) \ + $(JITTER_CXX_SOURCES) \ + $(MEMORY_CXX_SOURCES) \ + $(BUILT_SOURCES) + +rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/scripts/gen_knobs.py \ + rasterizer/scripts + +rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_types.py \ + --input $(srcdir)/../rasterizer/core/state.h \ + --output rasterizer/jitter/state_llvm.h + +rasterizer/jitter/builder_gen.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.h \ + --gen_h + +rasterizer/jitter/builder_gen.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.cpp \ + --gen_cpp + +rasterizer/jitter/builder_x86.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.h \ + --gen_x86_h + +rasterizer/jitter/builder_x86.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.cpp \ + --gen_x86_cpp + + +libswrAVX_la_LIBADD = \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/mesa/libmesagallium.la + +include $(top_srcdir)/install-gallium-links.mk diff --git a/src/gallium/drivers/swr/avx2/Makefile.am b/src/gallium/drivers/swr/avx2/Makefile.am new file mode 100644 index 00000000000..a3968ecd95e --- /dev/null +++ b/src/gallium/drivers/swr/avx2/Makefile.am @@ -0,0 +1,99 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include ../Makefile.sources-arch +include $(top_srcdir)/src/gallium/Automake.inc + +VPATH = $(srcdir) $(srcdir)/.. + +AM_CXXFLAGS = \ + -march=core-avx2 \ + -DKNOB_ARCH=KNOB_ARCH_AVX2 \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + -I$(builddir)/rasterizer/scripts \ + -I$(builddir)/rasterizer/jitter \ + -I$(srcdir)/../rasterizer \ + -I$(srcdir)/../rasterizer/core \ + -I$(srcdir)/../rasterizer/jitter + +lib_LTLIBRARIES = libswrAVX2.la + +BUILT_SOURCES = \ + rasterizer/scripts/gen_knobs.cpp \ + rasterizer/scripts/gen_knobs.h \ + rasterizer/jitter/state_llvm.h \ + rasterizer/jitter/builder_gen.h \ + rasterizer/jitter/builder_gen.cpp \ + rasterizer/jitter/builder_x86.h \ + rasterizer/jitter/builder_x86.cpp + +libswrAVX2_la_SOURCES = \ + $(CXX_SOURCES) \ + $(COMMON_CXX_SOURCES) \ + $(CORE_CXX_SOURCES) \ + $(JITTER_CXX_SOURCES) \ + $(MEMORY_CXX_SOURCES) \ + $(BUILT_SOURCES) + +rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/scripts/gen_knobs.py \ + rasterizer/scripts + +rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_types.py \ + --input $(srcdir)/../rasterizer/core/state.h \ + --output rasterizer/jitter/state_llvm.h + +rasterizer/jitter/builder_gen.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.h \ + --gen_h + +rasterizer/jitter/builder_gen.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.cpp \ + --gen_cpp + +rasterizer/jitter/builder_x86.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.h \ + --gen_x86_h + +rasterizer/jitter/builder_x86.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) 
$(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.cpp \ + --gen_x86_cpp + + +libswrAVX2_la_LIBADD = \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/mesa/libmesagallium.la + +include $(top_srcdir)/install-gallium-links.mk diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp new file mode 100644 index 00000000000..bc96c5f62fd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp @@ -0,0 +1,208 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+****************************************************************************/ + +#ifndef SWRLIB_CONTAINERS_HPP__ +#define SWRLIB_CONTAINERS_HPP__ + +#include <functional> +#include "common/os.h" + +namespace SWRL +{ + +template <typename T, int NUM_ELEMENTS> +struct UncheckedFixedVector +{ + UncheckedFixedVector() : mSize(0) + { + } + + UncheckedFixedVector(std::size_t size, T const& exemplar) + { + this->mSize = 0; + for (std::size_t i = 0; i < size; ++i) + this->push_back(exemplar); + } + + template <typename Iter> + UncheckedFixedVector(Iter fst, Iter lst) + { + this->mSize = 0; + for ( ; fst != lst; ++fst) + this->push_back(*fst); + } + + UncheckedFixedVector(UncheckedFixedVector const& UFV) + { + this->mSize = 0; + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + } + + UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) + { + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + return *this; + } + + T* begin() { return &this->mElements[0]; } + T* end() { return &this->mElements[0] + this->mSize; } + T const* begin() const { return &this->mElements[0]; } + T const* end() const { return &this->mElements[0] + this->mSize; } + + friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return false; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return false; + } + return true; + } + + friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return true; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return true; + } + return false; + } + + T& operator[](std::size_t idx) + { + return this->mElements[idx]; + } + T const& operator[](std::size_t idx) const + { + return this->mElements[idx]; + } + void push_back(T const& t) + { + this->mElements[this->mSize] = t; + ++this->mSize; + } + void pop_back() + { + SWR_ASSERT(this->mSize > 0); + --this->mSize; + } + T& back() + { + return this->mElements[this->mSize-1]; + } + T const& back() const + { + return this->mElements[this->mSize-1]; + } + bool empty() const + { + return this->mSize == 0; + } + std::size_t size() const + { + return this->mSize; + } + void resize(std::size_t sz) + { + this->mSize = sz; + } + void clear() + { + this->resize(0); + } +private: + std::size_t mSize; + T mElements[NUM_ELEMENTS]; +}; + +template <typename T, int NUM_ELEMENTS> +struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> +{ + FixedStack() {} + + void push(T const& t) + { + this->push_back(t); + } + + void pop() + { + this->pop_back(); + } + + T& top() + { + return this->back(); + } + + T const& top() const + { + return this->back(); + } +}; + +template <typename T> +struct CRCHash +{ + static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B"); + UINT operator()(const T& k) const + { + UINT *pData = (UINT*)&k; + UINT crc = 0; + for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i) + { + crc = _mm_crc32_u32(crc, pData[i]); + } + return crc; + } +}; + +}// end SWRL + +namespace std +{ + +template <typename T, int N> +struct hash<SWRL::UncheckedFixedVector<T, N>> +{ + size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const + { + if (v.size() == 0) return 0; + std::hash<T> H; + size_t x = H(v[0]); + if (v.size() == 1) return x; + for (size_t i = 1; i < v.size(); ++i) + x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + 
(x>>2); + return x; + } +}; + + +}// end std. + +#endif//SWRLIB_CONTAINERS_HPP__ diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp new file mode 100644 index 00000000000..ed8ce7e5b0f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp @@ -0,0 +1,5469 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file formats.cpp +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#include "formats.h" + +// lookup table for unorm8 srgb -> float conversion +const uint32_t srgb8Table[256] = { + 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, + 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, + 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, + 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, + 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, + 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, + 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, + 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, + 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 
0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, + 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, + 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, + 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, + 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, + 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, + 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, + 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, +}; + +// order must match SWR_FORMAT +const SWR_FORMAT_INFO gFormatInfo[] = { + // R32G32B32A32_FLOAT (0x0) + { + "R32G32B32A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_SINT (0x1) + { + "R32G32B32A32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_UINT (0x2) + { + "R32G32B32A32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32X32_FLOAT (0x6) + { + "R32G32B32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_SSCALED (0x7) + { + "R32G32B32A32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_USCALED (0x8) + { + "R32G32B32A32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x10 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x11 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x12 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x13 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x14 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x15 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, 
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x16 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x17 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x18 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x19 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x20 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x21 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x22 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x23 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x24 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x25 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x26 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x27 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x28 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x29 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x30 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x31 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x32 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x33 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x34 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x35 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x36 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x37 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x38 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x39 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3c (Padding) + { + 
"UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32_FLOAT (0x40) + { + "R32G32B32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_SINT (0x41) + { + "R32G32B32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_UINT (0x42) + { + "R32G32B32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x43 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x44 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32_SSCALED (0x45) + { + "R32G32B32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_USCALED (0x46) + { + "R32G32B32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x47 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x48 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x49 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x50 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x51 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x52 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x53 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x54 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x55 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x56 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x57 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x58 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x59 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5a (Padding) + { + "UNKNOWN", + { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x60 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x61 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x62 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x63 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x64 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x65 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x66 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 
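// Why the long runs of "UNKNOWN" rows: the table is laid out so it can be
// indexed directly by the hardware format enum value (note the hex indices
// in the comments), so every unassigned encoding still needs a placeholder
// row to keep the indices dense. A minimal accessor sketch; the names
// gFormatInfo and NUM_SWR_FORMATS are illustrative assumptions, not
// necessarily the driver's own (assert needs <cassert>):
inline const SWR_FORMAT_INFO &LookupFormatInfo(uint32_t format)
{
    assert(format < NUM_SWR_FORMATS); // padding rows make this a direct index
    return gFormatInfo[format];
}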
0x67 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x68 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x69 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x70 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x71 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x72 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x73 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 
0.0f, 0.0f }, + 1, 1, false }, + // 0x74 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x75 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x76 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x77 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x78 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x79 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16A16_UNORM (0x80) + { + "R16G16B16A16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // 
Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_SNORM (0x81) + { + "R16G16B16A16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_SINT (0x82) + { + "R16G16B16A16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_UINT (0x83) + { + "R16G16B16A16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_FLOAT (0x84) + { + "R16G16B16A16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_FLOAT (0x85) + { + "R32G32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
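// How the "To float scale factor" column is used for normalized types: the
// raw integer channel is multiplied by the per-channel scale. A minimal
// sketch for the 16-bit cases above (types from <cstdint>); SNORM
// additionally clamps, since -32768 * (1/32767) undershoots -1.0:
static inline float Unorm16ToFloat(uint16_t raw) { return raw * (1.0f / 65535.0f); }
static inline float Snorm16ToFloat(int16_t raw)
{
    float f = raw * (1.0f / 32767.0f);
    return (f < -1.0f) ? -1.0f : f; // clamp per the usual SNORM rule
}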
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_SINT (0x86) + { + "R32G32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_UINT (0x87) + { + "R32G32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_FLOAT_X8X24_TYPELESS (0x88) + { + "R32_FLOAT_X8X24_TYPELESS", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // X32_TYPELESS_G8X24_UINT (0x89) + { + "X32_TYPELESS_G8X24_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // L32A32_FLOAT (0x8a) + { + "L32A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
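// The "Defaults for missing components" column stores raw bit patterns, not
// typed values: 0x3f800000 is the IEEE-754 encoding of 1.0f (so a missing
// alpha reads as one), while the integer formats use 0x1. A sketch of
// reinterpreting such a default; memcpy is the portable pre-C++20 bit cast
// (needs <cstring>):
static inline float DefaultBitsAsFloat(uint32_t bits)
{
    float f;
    memcpy(&f, &bits, sizeof f); // 0x3f800000 -> 1.0f
    return f;
}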
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0x8b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x8c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x8d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16X16_UNORM (0x8e) + { + "R16G16B16X16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16X16_FLOAT (0x8f) + { + "R16G16B16X16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x90 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L32X32_FLOAT (0x91) + { + "L32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // I32X32_FLOAT (0x92) + { + "I32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
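// For the luminance/alpha formats above, the swizzle { 0, 3, 0, 0 } routes
// stored channel 0 into R and stored channel 1 into A, and isLuminance
// signals that R should be replicated across G and B further down the
// pipeline. An illustrative expansion, not the driver's actual fetch path:
static inline void ExpandLuminanceAlpha(float lum, float alpha, float rgba[4])
{
    rgba[0] = rgba[1] = rgba[2] = lum; // L replicated to R, G, B
    rgba[3] = alpha;
}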
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // R16G16B16A16_SSCALED (0x93) + { + "R16G16B16A16_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_USCALED (0x94) + { + "R16G16B16A16_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_SSCALED (0x95) + { + "R32G32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_USCALED (0x96) + { + "R32G32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x97 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32_FLOAT_X8X24_TYPELESS_LD (0x98) + { + "R32_FLOAT_X8X24_TYPELESS_LD", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
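// SSCALED/USCALED mark vertex formats whose integer bits are converted to
// float without normalization, which is why their scale factors are all
// 1.0f and "Is normalized?" stays false. In essence the conversion is just
// a cast (a simplification; very large 32-bit values lose float precision):
static inline float SscaledToFloat(int32_t v) { return (float)v; }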
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x99 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 
}, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xaa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xab (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xac (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xad (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xae (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xaf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xba (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbe (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // B8G8R8A8_UNORM (0xc0) + { + "B8G8R8A8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B8G8R8A8_UNORM_SRGB (0xc1) + { + "B8G8R8A8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UNORM (0xc2) + { + "R10G10B10A2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UNORM_SRGB (0xc3) + { + "R10G10B10A2_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UINT (0xc4) + { + "R10G10B10A2_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
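// Reading the swizzle column: swizzle[i] appears to be the RGBA slot that
// stored component i lands in — B8G8R8A8 stores bytes as B, G, R, A, hence
// { 2, 1, 0, 3 } (consistent with A32_FLOAT's { 3, 0, 0, 0 } and L16A16's
// { 0, 3, 0, 0 }). A sketch of applying it after per-component decode:
static inline void PlaceComponents(const float decoded[4], const uint32_t swizzle[4],
                                   uint32_t numComps, float rgba[4])
{
    for (uint32_t i = 0; i < numComps; ++i)
        rgba[swizzle[i]] = decoded[i]; // e.g. stored B lands in rgba[2]
}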
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xc5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xc6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8A8_UNORM (0xc7) + { + "R8G8B8A8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_UNORM_SRGB (0xc8) + { + "R8G8B8A8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_SNORM (0xc9) + { + "R8G8B8A8_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_SINT (0xca) + { + "R8G8B8A8_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_UINT (0xcb) + { + "R8G8B8A8_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
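// isSRGB flags formats whose color channels need sRGB-to-linear decoding on
// top of the UNORM scale (alpha stays linear). The standard piecewise
// curve, shown for reference (needs <cmath>):
static inline float SrgbToLinear(float c)
{
    return (c <= 0.04045f) ? (c / 12.92f)
                           : powf((c + 0.055f) / 1.055f, 2.4f);
}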
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_UNORM (0xcc) + { + "R16G16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_SNORM (0xcd) + { + "R16G16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_SINT (0xce) + { + "R16G16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_UINT (0xcf) + { + "R16G16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_FLOAT (0xd0) + { + "R16G16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UNORM (0xd1) + { + "B10G10R10A2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
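// For packed layouts like R10G10B10A2, "Bits per component" gives field
// widths inside a single 32-bit element. An illustrative unpack, assuming
// the first-named component occupies the least-significant bits (the usual
// convention for these format names):
static inline uint32_t ExtractField(uint32_t elem, uint32_t shift, uint32_t bits)
{
    return (elem >> shift) & ((1u << bits) - 1u);
}
// r = ExtractField(e,  0, 10);  g = ExtractField(e, 10, 10);
// b = ExtractField(e, 20, 10);  a = ExtractField(e, 30,  2);
// Each field is then scaled by the matching factor: 1/1023 or 1/3.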
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UNORM_SRGB (0xd2) + { + "B10G10R10A2_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R11G11B10_FLOAT (0xd3) + { + "R11G11B10_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 11, 11, 10, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xd4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xd5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32_SINT (0xd6) + { + "R32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_UINT (0xd7) + { + "R32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_FLOAT (0xd8) + { + "R32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R24_UNORM_X8_TYPELESS (0xd9) + { + "R24_UNORM_X8_TYPELESS", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 24, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xda (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xdb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R24_UNORM_X8_TYPELESS_LD (0xdc) + { + "R24_UNORM_X8_TYPELESS_LD", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 24, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // L32_UNORM (0xdd) + { + "L32_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 4294967295.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xde (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L16A16_UNORM (0xdf) + { + "L16A16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
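// R24_UNORM_X8_TYPELESS is the classic 24-bit depth layout: 24 normalized
// bits in a 32-bit element with 8 typeless padding bits on top, matching
// the single 1/16777215 scale factor above. A sketch of the decode:
static inline float DecodeDepth24(uint32_t elem)
{
    return (elem & 0x00FFFFFFu) * (1.0f / 16777215.0f);
}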
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // I24X8_UNORM (0xe0) + { + "I24X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 24, 8, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // L24X8_UNORM (0xe1) + { + "L24X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 24, 8, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xe2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // I32_FLOAT (0xe3) + { + "I32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // L32_FLOAT (0xe4) + { + "L32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // A32_FLOAT (0xe5) + { + "A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 3, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xe6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // B8G8R8X8_UNORM (0xe9) + { + "B8G8R8X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B8G8R8X8_UNORM_SRGB (0xea) + { + "B8G8R8X8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8X8_UNORM (0xeb) + { + "R8G8B8X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8X8_UNORM_SRGB (0xec) + { + "R8G8B8X8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R9G9B9E5_SHAREDEXP (0xed) + { + "R9G9B9E5_SHAREDEXP", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 9, 9, 9, 5 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10X2_UNORM (0xee) + { + "B10G10R10X2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xef (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L16A16_FLOAT (0xf0) + { + "L16A16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xf1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xf2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R10G10B10X2_USCALED (0xf3) + { + "R10G10B10X2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
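// R9G9B9E5_SHAREDEXP packs three 9-bit mantissas with one shared 5-bit
// exponent; the table models the raw fields as UINT and leaves the float
// decode elsewhere. The standard decode (bias 15, 9 mantissa bits, as in
// GL_EXT_texture_shared_exponent), for reference (ldexpf needs <cmath>):
static inline void DecodeRgb9e5(uint32_t e, float rgb[3])
{
    const float scale = ldexpf(1.0f, (int)((e >> 27) & 0x1Fu) - 15 - 9);
    rgb[0] = ((e >>  0) & 0x1FFu) * scale;
    rgb[1] = ((e >>  9) & 0x1FFu) * scale;
    rgb[2] = ((e >> 18) & 0x1FFu) * scale;
}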
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8B8A8_SSCALED (0xf4)
+    {
+        "R8G8B8A8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8B8A8_USCALED (0xf5)
+    {
+        "R8G8B8A8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16G16_SSCALED (0xf6)
+    {
+        "R16G16_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 16, 16, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16G16_USCALED (0xf7)
+    {
+        "R16G16_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 16, 16, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R32_SSCALED (0xf8)
+    {
+        "R32_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 32, 0, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R32_USCALED (0xf9)
+    {
+        "R32_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 32, 0, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0xfa (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfb (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfc (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfd (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfe (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xff (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // B5G6R5_UNORM (0x100)
+    {
+        "B5G6R5_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 0 }, // Swizzle
+        { 5, 6, 5, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        3, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G6R5_UNORM_SRGB (0x101)
+    {
+        "B5G6R5_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 0 }, // Swizzle
+        { 5, 6, 5, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        3, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5A1_UNORM (0x102)
+    {
+        "B5G5R5A1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5A1_UNORM_SRGB (0x103)
+    {
+        "B5G5R5A1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B4G4R4A4_UNORM (0x104)
+    {
+        "B4G4R4A4_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 4, 4, 4, 4 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B4G4R4A4_UNORM_SRGB (0x105)
+    {
+        "B4G4R4A4_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 4, 4, 4, 4 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_UNORM (0x106)
+    {
+        "R8G8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SNORM (0x107)
+    {
+        "R8G8_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SINT (0x108)
+    {
+        "R8G8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_UINT (0x109)
+    {
+        "R8G8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_UNORM (0x10a)
+    {
+        "R16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SNORM (0x10b)
+    {
+        "R16_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SINT (0x10c)
+    {
+        "R16_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_UINT (0x10d)
+    {
+        "R16_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_FLOAT (0x10e)
+    {
+        "R16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x10f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x110 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // I16_UNORM (0x111)
+    {
+        "I16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L16_UNORM (0x112)
+    {
+        "L16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // A16_UNORM (0x113)
+    {
+        "A16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // L8A8_UNORM (0x114)
+    {
+        "L8A8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I16_FLOAT (0x115)
+    {
+        "I16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L16_FLOAT (0x116)
+    {
+        "L16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // A16_FLOAT (0x117)
+    {
+        "A16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // L8A8_UNORM_SRGB (0x118)
+    {
+        "L8A8_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x119 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // B5G5R5X1_UNORM (0x11a)
+    {
+        "B5G5R5X1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5X1_UNORM_SRGB (0x11b)
+    {
+        "B5G5R5X1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SSCALED (0x11c)
+    {
+        "R8G8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_USCALED (0x11d)
+    {
+        "R8G8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SSCALED (0x11e)
+    {
+        "R16_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_USCALED (0x11f)
+    {
+        "R16_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x120 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x121 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x122 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x123 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x124 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x125 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8A8_UINT (0x126)
+    {
+        "L8A8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8A8_SINT (0x127)
+    {
+        "L8A8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x128 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x129 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x130 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x131 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x132 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x133 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x134 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x135 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x136 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x137 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x138 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x139 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // R8_UNORM (0x140)
+    {
+        "R8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_SNORM (0x141)
+    {
+        "R8_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_SINT (0x142)
+    {
+        "R8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_UINT (0x143)
+    {
+        "R8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // A8_UNORM (0x144)
+    {
+        "A8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // I8_UNORM (0x145)
+    {
+        "I8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8_UNORM (0x146)
+    {
+        "L8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x147 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x148 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // R8_SSCALED (0x149)
+    {
+        "R8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_USCALED (0x14a)
+    {
+        "R8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x14b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8_UNORM_SRGB (0x14c)
+    {
+        "L8_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x14d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x14e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x14f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x150 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x151 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8_UINT (0x152)
+    {
+        "L8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8_SINT (0x153)
+    {
+        "L8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I8_UINT (0x154)
+    {
+        "I8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I8_SINT (0x155)
+    {
+        "I8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x156 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x157 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x158 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x159 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x160 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x161 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x162 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x163 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x164 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x165 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x166 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x167 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x168 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x169 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x170 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x171 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x172 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x173 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x174 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x175 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x176 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x177 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x178 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x179 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x180 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x181 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x182 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // YCRCB_SWAPUVY (0x183)
+    {
+        "YCRCB_SWAPUVY",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        true, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        2, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x184 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x185 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // BC1_UNORM (0x186)
+    {
+        "BC1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC2_UNORM (0x187)
+    {
+        "BC2_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC3_UNORM (0x188)
+    {
+        "BC3_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC4_UNORM (0x189)
+    {
+        "BC4_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC5_UNORM (0x18a)
+    {
+        "BC5_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC1_UNORM_SRGB (0x18b)
+    {
+        "BC1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC2_UNORM_SRGB (0x18c)
+    {
+        "BC2_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC3_UNORM_SRGB (0x18d) + { + "BC3_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // 0x18e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // YCRCB_SWAPUV (0x18f) + { + "YCRCB_SWAPUV", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + true, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 2, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x190 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x191 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x192 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UNORM (0x193) + { + "R8G8B8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SNORM (0x194) + { + "R8G8B8_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SSCALED (0x195) + { + "R8G8B8_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_USCALED (0x196) + { + "R8G8B8_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x197 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x198 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // BC4_SNORM (0x199) + { + "BC4_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC5_SNORM (0x19a) + { + "BC5_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // R16G16B16_FLOAT (0x19b) + { + "R16G16B16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_UNORM (0x19c) + { + "R16G16B16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SNORM (0x19d) + { + "R16G16B16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SSCALED (0x19e) + { + "R16G16B16_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_USCALED (0x19f) + { + "R16G16B16_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1a0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // BC6H_SF16 (0x1a1) + { + "BC6H_SF16", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC7_UNORM (0x1a2) + { + "BC7_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC7_UNORM_SRGB (0x1a3) + { + "BC7_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC6H_UF16 (0x1a4) + { + "BC6H_UF16", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // 0x1a5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UNORM_SRGB (0x1a8) + { + "R8G8B8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1a9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1aa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ab (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ac (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ad (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ae (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1af (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16_UINT (0x1b0) + { + "R16G16B16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SINT (0x1b1) + { + "R16G16B16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1b2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R10G10B10A2_SNORM (0x1b3) + { + "R10G10B10A2_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_USCALED (0x1b4) + { + "R10G10B10A2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_SSCALED (0x1b5) + { + "R10G10B10A2_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_SINT (0x1b6) + { + "R10G10B10A2_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SNORM (0x1b7) + { + "B10G10R10A2_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_USCALED (0x1b8) + { + "B10G10R10A2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SSCALED (0x1b9) + { + "B10G10R10A2_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UINT (0x1ba) + { + "B10G10R10A2_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SINT (0x1bb) + { + "B10G10R10A2_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1bc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1bd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1be (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1bf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UINT (0x1c8) + { + "R8G8B8_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN 
}, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SINT (0x1c9) + { + "R8G8B8_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h new file mode 100644 index 00000000000..b9dd53ebaa4 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/formats.h @@ -0,0 +1,251 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
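// Usage sketch (illustrative only, not part of the generated table): a
// consumer looks entries up through GetFormatInfo(), declared further below
// in formats.h. 'width' here is a hypothetical mip-level width in pixels.
//
//   const SWR_FORMAT_INFO &info = GetFormatInfo(BC1_UNORM);
//   // BC formats advertise a 4x4 block size, so the pitch is computed in
//   // blocks of Bpp bytes, not in pixels:
//   uint32_t blocksWide = (width + info.bcWidth - 1) / info.bcWidth;
//   uint32_t pitchBytes = blocksWide * info.Bpp;   // 8 bytes per BC1 block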
+* +* @file formats.h +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +#include "common/os.h" + +////////////////////////////////////////////////////////////////////////// +/// SWR_TYPE - Format component type +////////////////////////////////////////////////////////////////////////// +enum SWR_TYPE +{ + SWR_TYPE_UNKNOWN, + SWR_TYPE_UNUSED, + SWR_TYPE_UNORM, + SWR_TYPE_SNORM, + SWR_TYPE_UINT, + SWR_TYPE_SINT, + SWR_TYPE_FLOAT, + SWR_TYPE_SSCALED, + SWR_TYPE_USCALED, +}; +////////////////////////////////////////////////////////////////////////// +/// SWR_FORMAT +////////////////////////////////////////////////////////////////////////// +enum SWR_FORMAT +{ + R32G32B32A32_FLOAT = 0x0, + R32G32B32A32_SINT = 0x1, + R32G32B32A32_UINT = 0x2, + R32G32B32X32_FLOAT = 0x6, + R32G32B32A32_SSCALED = 0x7, + R32G32B32A32_USCALED = 0x8, + R32G32B32_FLOAT = 0x40, + R32G32B32_SINT = 0x41, + R32G32B32_UINT = 0x42, + R32G32B32_SSCALED = 0x45, + R32G32B32_USCALED = 0x46, + R16G16B16A16_UNORM = 0x80, + R16G16B16A16_SNORM = 0x81, + R16G16B16A16_SINT = 0x82, + R16G16B16A16_UINT = 0x83, + R16G16B16A16_FLOAT = 0x84, + R32G32_FLOAT = 0x85, + R32G32_SINT = 0x86, + R32G32_UINT = 0x87, + R32_FLOAT_X8X24_TYPELESS = 0x88, + X32_TYPELESS_G8X24_UINT = 0x89, + L32A32_FLOAT = 0x8A, + R16G16B16X16_UNORM = 0x8E, + R16G16B16X16_FLOAT = 0x8F, + L32X32_FLOAT = 0x91, + I32X32_FLOAT = 0x92, + R16G16B16A16_SSCALED = 0x93, + R16G16B16A16_USCALED = 0x94, + R32G32_SSCALED = 0x95, + R32G32_USCALED = 0x96, + R32_FLOAT_X8X24_TYPELESS_LD = 0x98, + B8G8R8A8_UNORM = 0xC0, + B8G8R8A8_UNORM_SRGB = 0xC1, + R10G10B10A2_UNORM = 0xC2, + R10G10B10A2_UNORM_SRGB = 0xC3, + R10G10B10A2_UINT = 0xC4, + R8G8B8A8_UNORM = 0xC7, + R8G8B8A8_UNORM_SRGB = 0xC8, + R8G8B8A8_SNORM = 0xC9, + R8G8B8A8_SINT = 0xCA, + R8G8B8A8_UINT = 0xCB, + R16G16_UNORM = 0xCC, + R16G16_SNORM = 0xCD, + R16G16_SINT = 0xCE, + R16G16_UINT = 0xCF, + R16G16_FLOAT = 0xD0, + B10G10R10A2_UNORM = 0xD1, + B10G10R10A2_UNORM_SRGB = 0xD2, + R11G11B10_FLOAT = 0xD3, + R32_SINT = 0xD6, + R32_UINT = 0xD7, + R32_FLOAT = 0xD8, + R24_UNORM_X8_TYPELESS = 0xD9, + R24_UNORM_X8_TYPELESS_LD = 0xDC, + L32_UNORM = 0xDD, + L16A16_UNORM = 0xDF, + I24X8_UNORM = 0xE0, + L24X8_UNORM = 0xE1, + I32_FLOAT = 0xE3, + L32_FLOAT = 0xE4, + A32_FLOAT = 0xE5, + B8G8R8X8_UNORM = 0xE9, + B8G8R8X8_UNORM_SRGB = 0xEA, + R8G8B8X8_UNORM = 0xEB, + R8G8B8X8_UNORM_SRGB = 0xEC, + R9G9B9E5_SHAREDEXP = 0xED, + B10G10R10X2_UNORM = 0xEE, + L16A16_FLOAT = 0xF0, + R10G10B10X2_USCALED = 0xF3, + R8G8B8A8_SSCALED = 0xF4, + R8G8B8A8_USCALED = 0xF5, + R16G16_SSCALED = 0xF6, + R16G16_USCALED = 0xF7, + R32_SSCALED = 0xF8, + R32_USCALED = 0xF9, + B5G6R5_UNORM = 0x100, + B5G6R5_UNORM_SRGB = 0x101, + B5G5R5A1_UNORM = 0x102, + B5G5R5A1_UNORM_SRGB = 0x103, + B4G4R4A4_UNORM = 0x104, + B4G4R4A4_UNORM_SRGB = 0x105, + R8G8_UNORM = 0x106, + R8G8_SNORM = 0x107, + R8G8_SINT = 0x108, + R8G8_UINT = 0x109, + R16_UNORM = 0x10A, + R16_SNORM = 0x10B, + R16_SINT = 0x10C, + R16_UINT = 0x10D, + R16_FLOAT = 0x10E, + I16_UNORM = 0x111, + L16_UNORM = 0x112, + A16_UNORM = 0x113, + L8A8_UNORM = 0x114, + I16_FLOAT = 0x115, + L16_FLOAT = 0x116, + A16_FLOAT = 0x117, + L8A8_UNORM_SRGB = 0x118, + B5G5R5X1_UNORM = 0x11A, + B5G5R5X1_UNORM_SRGB = 0x11B, + R8G8_SSCALED = 0x11C, + R8G8_USCALED = 0x11D, + R16_SSCALED = 0x11E, + R16_USCALED = 0x11F, + L8A8_UINT = 0x126, + L8A8_SINT = 0x127, + R8_UNORM = 0x140, + R8_SNORM = 0x141, + R8_SINT = 0x142, + R8_UINT = 0x143, + A8_UNORM = 
0x144, + I8_UNORM = 0x145, + L8_UNORM = 0x146, + R8_SSCALED = 0x149, + R8_USCALED = 0x14A, + L8_UNORM_SRGB = 0x14C, + L8_UINT = 0x152, + L8_SINT = 0x153, + I8_UINT = 0x154, + I8_SINT = 0x155, + YCRCB_SWAPUVY = 0x183, + BC1_UNORM = 0x186, + BC2_UNORM = 0x187, + BC3_UNORM = 0x188, + BC4_UNORM = 0x189, + BC5_UNORM = 0x18A, + BC1_UNORM_SRGB = 0x18B, + BC2_UNORM_SRGB = 0x18C, + BC3_UNORM_SRGB = 0x18D, + YCRCB_SWAPUV = 0x18F, + R8G8B8_UNORM = 0x193, + R8G8B8_SNORM = 0x194, + R8G8B8_SSCALED = 0x195, + R8G8B8_USCALED = 0x196, + BC4_SNORM = 0x199, + BC5_SNORM = 0x19A, + R16G16B16_FLOAT = 0x19B, + R16G16B16_UNORM = 0x19C, + R16G16B16_SNORM = 0x19D, + R16G16B16_SSCALED = 0x19E, + R16G16B16_USCALED = 0x19F, + BC6H_SF16 = 0x1A1, + BC7_UNORM = 0x1A2, + BC7_UNORM_SRGB = 0x1A3, + BC6H_UF16 = 0x1A4, + R8G8B8_UNORM_SRGB = 0x1A8, + R16G16B16_UINT = 0x1B0, + R16G16B16_SINT = 0x1B1, + R10G10B10A2_SNORM = 0x1B3, + R10G10B10A2_USCALED = 0x1B4, + R10G10B10A2_SSCALED = 0x1B5, + R10G10B10A2_SINT = 0x1B6, + B10G10R10A2_SNORM = 0x1B7, + B10G10R10A2_USCALED = 0x1B8, + B10G10R10A2_SSCALED = 0x1B9, + B10G10R10A2_UINT = 0x1BA, + B10G10R10A2_SINT = 0x1BB, + R8G8B8_UINT = 0x1C8, + R8G8B8_SINT = 0x1C9, + NUM_SWR_FORMATS = 0x1CA, +}; +////////////////////////////////////////////////////////////////////////// +/// SWR_FORMAT_INFO - Format information +////////////////////////////////////////////////////////////////////////// +struct SWR_FORMAT_INFO +{ + const char* name; + SWR_TYPE type[4]; + uint32_t defaults[4]; + uint32_t swizzle[4]; ///< swizzle per component + uint32_t bpc[4]; ///< bits per component + uint32_t bpp; ///< bits per pixel + uint32_t Bpp; ///< bytes per pixel + uint32_t numComps; ///< number of components + bool isSRGB; + bool isBC; + bool isSubsampled; + bool isNormalized[4]; + float toFloat[4]; + uint32_t bcWidth; + uint32_t bcHeight; + bool isLuminance; +}; + +extern const SWR_FORMAT_INFO gFormatInfo[]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Retrieves format info struct for given format. +/// @param format - SWR format +INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) +{ + return gFormatInfo[format]; +} + +// lookup table for unorm8 srgb -> float conversion +extern const uint32_t srgb8Table[256]; diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp new file mode 100644 index 00000000000..ef381799bc3 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp @@ -0,0 +1,235 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#pragma once + +#include <iostream> +#include <vector> +#include <bitset> +#include <array> +#include <string> +#include <algorithm> + +#if defined(_WIN32) +#include <intrin.h> +#else +#include <string.h> +#include <cpuid.h> +#endif + +class InstructionSet +{ +public: + InstructionSet() : CPU_Rep() {}; + + // getters + std::string Vendor(void) { return CPU_Rep.vendor_; } + std::string Brand(void) { return CPU_Rep.brand_; } + + bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } + bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } + bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } + bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } + bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } + bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } + bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } + bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } + bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } + bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } + bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } + bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } + bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } + bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } + + bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } + bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } + bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } + bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } + bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } + bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } + bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } + bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } + bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } + + bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } + bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } + bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } + bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } + bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } + bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } + bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } + bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } + bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } + bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } + + bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } + + bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } + bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } + bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } + bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } + bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } + bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } + + bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } + bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } + bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } + bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } + bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } + + bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } + bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } + bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } + bool AVX512F(void) { return 
CPU_Rep.f_7_EBX_[16]; } + bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } + bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; } + bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } + +private: + class InstructionSet_Internal + { + public: + InstructionSet_Internal() + : nIds_{ 0 }, + nExIds_{ 0 }, + isIntel_{ false }, + isAMD_{ false }, + f_1_ECX_{ 0 }, + f_1_EDX_{ 0 }, + f_7_EBX_{ 0 }, + f_7_ECX_{ 0 }, + f_81_ECX_{ 0 }, + f_81_EDX_{ 0 }, + data_{}, + extdata_{} + { + //int cpuInfo[4] = {-1}; + std::array<int, 4> cpui; + + // Calling __cpuid with 0x0 as the function_id argument + // gets the number of the highest valid function ID. +#if defined(_WIN32) + __cpuid(cpui.data(), 0); + nIds_ = cpui[0]; +#else + nIds_ = __get_cpuid_max(0, NULL); +#endif + + for (int i = 0; i <= nIds_; ++i) + { +#if defined(_WIN32) + __cpuidex(cpui.data(), i, 0); +#else + int *data = cpui.data(); + __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); +#endif + data_.push_back(cpui); + } + + // Capture vendor string + char vendor[0x20]; + memset(vendor, 0, sizeof(vendor)); + *reinterpret_cast<int*>(vendor) = data_[0][1]; + *reinterpret_cast<int*>(vendor + 4) = data_[0][3]; + *reinterpret_cast<int*>(vendor + 8) = data_[0][2]; + vendor_ = vendor; + if (vendor_ == "GenuineIntel") + { + isIntel_ = true; + } + else if (vendor_ == "AuthenticAMD") + { + isAMD_ = true; + } + + // load bitset with flags for function 0x00000001 + if (nIds_ >= 1) + { + f_1_ECX_ = data_[1][2]; + f_1_EDX_ = data_[1][3]; + } + + // load bitset with flags for function 0x00000007 + if (nIds_ >= 7) + { + f_7_EBX_ = data_[7][1]; + f_7_ECX_ = data_[7][2]; + } + + // Calling __cpuid with 0x80000000 as the function_id argument + // gets the number of the highest valid extended ID. +#if defined(_WIN32) + __cpuid(cpui.data(), 0x80000000); + nExIds_ = cpui[0]; +#else + nExIds_ = __get_cpuid_max(0x80000000, NULL); +#endif + + char brand[0x40]; + memset(brand, 0, sizeof(brand)); + + for (unsigned i = 0x80000000; i <= nExIds_; ++i) + { +#if defined(_WIN32) + __cpuidex(cpui.data(), i, 0); +#else + int *data = cpui.data(); + __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); +#endif + extdata_.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (nExIds_ >= 0x80000001) + { + f_81_ECX_ = extdata_[1][2]; + f_81_EDX_ = extdata_[1][3]; + } + + // Interpret CPU brand string if reported + if (nExIds_ >= 0x80000004) + { + memcpy(brand, extdata_[2].data(), sizeof(cpui)); + memcpy(brand + 16, extdata_[3].data(), sizeof(cpui)); + memcpy(brand + 32, extdata_[4].data(), sizeof(cpui)); + brand_ = brand; + } + }; + + int nIds_; + unsigned nExIds_; + std::string vendor_; + std::string brand_; + bool isIntel_; + bool isAMD_; + std::bitset<32> f_1_ECX_; + std::bitset<32> f_1_EDX_; + std::bitset<32> f_7_EBX_; + std::bitset<32> f_7_ECX_; + std::bitset<32> f_81_ECX_; + std::bitset<32> f_81_EDX_; + std::vector<std::array<int, 4>> data_; + std::vector<std::array<int, 4>> extdata_; + }; + const InstructionSet_Internal CPU_Rep; +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h new file mode 100644 index 00000000000..522ae0dd65f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -0,0 +1,220 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
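// Usage sketch (illustrative; the swr backend selection itself lives in the
// driver code, not in this header): InstructionSet runs CPUID once at
// construction, so feature checks reduce to cached bitset reads.
//
//   static InstructionSet cpu;   // hypothetical instance name
//   if (cpu.AVX2())
//       { /* load the AVX2 rasterizer backend */ }
//   else if (cpu.AVX())
//       { /* fall back to the AVX backend */ }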
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#ifndef __SWR_OS_H__ +#define __SWR_OS_H__ + +#include "core/knobs.h" + +#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) + +#define SWR_API __cdecl + +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif + +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include "Windows.h" +#include <intrin.h> +#include <cstdint> + +#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD +#define THREAD __declspec(thread) +#define INLINE __forceinline +#define DEBUGBREAK __debugbreak() + +#define PRAGMA_WARNING_PUSH_DISABLE(...) \ + __pragma(warning(push));\ + __pragma(warning(disable:__VA_ARGS__)); + +#define PRAGMA_WARNING_POP() __pragma(warning(pop)) + +#if defined(_WIN32) +#if defined(_WIN64) +#define BitScanForwardSizeT BitScanForward64 +#define _mm_popcount_sizeT _mm_popcnt_u64 +#else +#define BitScanForwardSizeT BitScanForward +#define _mm_popcount_sizeT _mm_popcnt_u32 +#endif +#endif + +#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) + +#define SWR_API + +#include <stdlib.h> +#include <string.h> +#include <X11/Xmd.h> +#include <x86intrin.h> +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/stat.h> + +typedef void VOID; +typedef void* LPVOID; +typedef CARD8 BOOL; +typedef wchar_t WCHAR; +typedef uint16_t UINT16; +typedef int INT; +typedef unsigned int UINT; +typedef uint32_t UINT32; +typedef uint64_t UINT64; +typedef int64_t INT64; +typedef void* HANDLE; +typedef float FLOAT; +typedef int LONG; +typedef CARD8 BYTE; +typedef unsigned char UCHAR; +typedef unsigned int DWORD; + +#undef FALSE +#define FALSE 0 + +#undef TRUE +#define TRUE 1 + +#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) +#define THREAD __thread +#ifndef INLINE +#define INLINE __inline +#endif +#define DEBUGBREAK asm ("int $3") +#define __cdecl +#define __declspec(X) + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) +inline +uint64_t __rdtsc() +{ + long low, high; + asm volatile("rdtsc" : "=a"(low), "=d"(high)); + return (low | ((uint64_t)high << 32)); +} +#endif + +#ifndef __clang__ +// Intrinsic not defined in gcc +static INLINE +void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) +{ + _mm_storeu_si128((__m128i*)lo, 
_mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); +} +#endif + +inline +unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask) +{ + // use the 'long' builtin so 64-bit masks are not truncated + *Index = __builtin_ctzl(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask) +{ + *Index = __builtin_ctz(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask) +{ + // MSVC semantics: *Index is the bit position of the highest set bit, + // not the leading-zero count returned by __builtin_clzl + *Index = (8 * sizeof(Mask)) - 1 - __builtin_clzl(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) +{ + *Index = 31 - __builtin_clz(Mask); + return (Mask != 0); +} + +inline +void *_aligned_malloc(unsigned int size, unsigned int alignment) +{ + void *ret; + if (posix_memalign(&ret, alignment, size)) + { + return NULL; + } + return ret; +} + +inline +unsigned char _bittest(const LONG *a, LONG b) +{ + return ((*(unsigned *)(a) & (1 << b)) != 0); +} + +#define GetCurrentProcessId getpid + +#define CreateDirectory(name, pSecurity) mkdir(name, 0777) + +#if defined(_WIN32) +static inline +unsigned int _mm_popcnt_u32(unsigned int v) +{ + return __builtin_popcount(v); +} +#endif + +#define _aligned_free free +#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) +#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) +#define InterlockedDecrement(Addend) __sync_sub_and_fetch(Addend, 1) +#define InterlockedIncrement(Addend) __sync_add_and_fetch(Addend, 1) +#define _ReadWriteBarrier() asm volatile("" ::: "memory") +#define __stdcall + +#define PRAGMA_WARNING_PUSH_DISABLE(...) +#define PRAGMA_WARNING_POP() + +#else + +#error Unsupported OS/system. + +#endif + +// Universal types +typedef BYTE KILOBYTE[1024]; +typedef KILOBYTE MEGABYTE[1024]; +typedef MEGABYTE GIGABYTE[1024]; + +#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) +#if KNOB_SIMD_WIDTH == 8 +#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) +#endif + +#include "common/swr_assert.h" + +#endif//__SWR_OS_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp new file mode 100644 index 00000000000..454641b2751 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -0,0 +1,188 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
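// Note on the atomics above (values in the sketch are illustrative): the
// wrappers deliberately swap argument order, since Win32's
// InterlockedCompareExchange(dst, exchange, comparand) corresponds to GCC's
// __sync_val_compare_and_swap(dst, comparand, exchange); both return the
// value previously stored at dst.
//
//   volatile LONG refCount = 1;
//   InterlockedIncrement(&refCount);                  // refCount == 2
//   LONG old = InterlockedCompareExchange(&refCount, 5, 2);
//   // old == 2 and refCount == 5, because the comparand matched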
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rdtsc_buckets.cpp +* +* @brief implementation of rdtsc buckets. +* +* Notes: +* +******************************************************************************/ +#include "rdtsc_buckets.h" +#include <inttypes.h> + +THREAD UINT tlsThreadId = 0; + +void BucketManager::RegisterThread(const std::string& name) +{ + BUCKET_THREAD newThread; + newThread.name = name; + newThread.root.children.reserve(mBuckets.size()); + newThread.root.id = 0; + newThread.root.pParent = nullptr; + newThread.pCurrent = &newThread.root; + + mThreadMutex.lock(); + + // assign unique thread id for this thread + size_t id = mThreads.size(); + newThread.id = (UINT)id; + tlsThreadId = (UINT)id; + + // open threadviz file if enabled + if (mThreadViz) + { + std::stringstream ss; + // '/' is a valid path separator on both Windows and Linux + ss << mThreadVizDir << "/threadviz_thread." << newThread.id << ".dat"; + newThread.vizFile = fopen(ss.str().c_str(), "wb"); + } + + // store new thread + mThreads.push_back(newThread); + + mThreadMutex.unlock(); +} + +UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) +{ + size_t id = mBuckets.size(); + mBuckets.push_back(desc); + return (UINT)id; +} + +void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) +{ + const char *arrows[] = { + "", + "|-> ", + " |-> ", + " |-> ", + " |-> ", + " |-> ", + " |-> " + }; + + // clamp to the deepest arrow; hierarchies deeper than the table would + // otherwise index out of bounds + if (level >= sizeof(arrows) / sizeof(arrows[0])) + { + level = sizeof(arrows) / sizeof(arrows[0]) - 1; + } + + // compute percent of total cycles used by this bucket + float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); + + // compute percent of parent cycles used by this bucket + float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); + + // compute average cycle count per invocation + UINT64 CPE = bucket.elapsed / bucket.count; + + BUCKET_DESC &desc = mBuckets[bucket.id]; + + // construct hierarchy visualization + char hier[80]; + strcpy(hier, arrows[level]); + strcat(hier, desc.name.c_str()); + + // print out + fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", + percentTotal, + percentParent, + bucket.elapsed, + CPE, + bucket.count, + (unsigned long)0, + (uint32_t)0, + hier + ); + + // dump all children of this bucket + for (const BUCKET& child : bucket.children) + { + if (child.count) + { + PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); + } + } +} + +void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) +{ + // print header + fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); + fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); + + // compute thread level total cycle counts across all buckets from root + const BUCKET& root = thread.root; + UINT64 totalCycles = 0; + for (const BUCKET& child : root.children) + { + totalCycles += child.elapsed; + } + + for (const BUCKET& child : root.children) + { + if (child.count) + { + PrintBucket(f, 0, totalCycles, totalCycles, child); + } + } +} + +void BucketManager::DumpThreadViz() +{ + // ensure all thread data is flushed + mThreadMutex.lock(); + for (auto& thread : mThreads) + { + fflush(thread.vizFile); + fclose(thread.vizFile); + } + mThreadMutex.unlock(); + + // dump bucket descriptions + std::stringstream ss; + ss << mThreadVizDir << "/threadviz_buckets.dat"; + + FILE* f =
fopen(ss.str().c_str(), "wb"); + for (auto& bucket : mBuckets) + { + Serialize(f, bucket); + } + fclose(f); +} + +void BucketManager::PrintReport(const std::string& filename) +{ + if (mThreadViz) + { + DumpThreadViz(); + } + else + { + FILE* f = fopen(filename.c_str(), "w"); + + mThreadMutex.lock(); + for (const BUCKET_THREAD& thread : mThreads) + { + PrintThread(f, thread); + fprintf(f, "\n"); + } + mThreadMutex.unlock(); + + fclose(f); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h new file mode 100644 index 00000000000..99cb10ec6e8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -0,0 +1,229 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rdtsc_buckets.h +* +* @brief declaration for rdtsc buckets. +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "os.h" +#include <vector> +#include <mutex> +#include <sstream> + +#include "rdtsc_buckets_shared.h" + +// unique thread id stored in thread local storage +extern THREAD UINT tlsThreadId; + +////////////////////////////////////////////////////////////////////////// +/// @brief BucketManager encapsulates a single instance of the buckets +/// functionality. There can be one or many bucket managers active +/// at any time. The manager owns all the threads and +/// bucket information that have been registered to it. +class BucketManager +{ +public: + BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz) + { + if (mThreadViz) + { + uint32_t pid = GetCurrentProcessId(); + std::stringstream str; + str << "threadviz." << pid; + mThreadVizDir = str.str(); + CreateDirectory(mThreadVizDir.c_str(), NULL); + } + } + + // removes all registered thread data + void ClearThreads() + { + mThreadMutex.lock(); + mThreads.clear(); + mThreadMutex.unlock(); + } + + // removes all registered buckets + void ClearBuckets() + { + mBuckets.clear(); + } + + /// Registers a new thread with the manager. + /// @param name - name of thread, used for labels in reports and threadviz + void RegisterThread(const std::string& name); + + /// Registers a new bucket type with the manager. 
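// End-to-end sketch of the manager (illustrative; names are hypothetical and
// the driver drives this through its own rdtsc macros). Buckets are
// registered once up front, then bracketed around the code being measured
// with the StartBucket/StopBucket members declared below:
//
//   BucketManager mgr(false);                          // threadviz disabled
//   UINT rasterId = mgr.RegisterBucket({ "Raster", "", false, 0 });
//   mgr.RegisterThread("worker0");                     // from the worker thread
//   mgr.StartCapture();
//   mgr.StartBucket(rasterId);
//   // ... timed work ...
//   mgr.StopBucket(rasterId);
//   mgr.StopCapture();
//   mgr.PrintReport("rdtsc.txt");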
Returns a unique + /// id which should be used in subsequent calls to start/stop the bucket + /// @param desc - description of the bucket + /// @return unique id + UINT RegisterBucket(const BUCKET_DESC& desc); + + // dump threadviz data + void DumpThreadViz(); + + // print report + void PrintReport(const std::string& filename); + + // start capturing + INLINE void StartCapture() + { + mCapturing = true; + } + + // stop capturing + INLINE void StopCapture() + { + mCapturing = false; + + // wait for all threads to pop back to root bucket + bool stillCapturing = true; + while (stillCapturing) + { + stillCapturing = false; + for (const BUCKET_THREAD& t : mThreads) + { + if (t.pCurrent != &t.root) + { + stillCapturing = true; + continue; + } + } + } + } + + // start a bucket + // @param id generated by RegisterBucket + INLINE void StartBucket(UINT id) + { + if (!mCapturing) return; + + SWR_ASSERT(tlsThreadId < mThreads.size()); + + BUCKET_THREAD& bt = mThreads[tlsThreadId]; + + // if threadviz is enabled, only need to dump start info to threads viz file + if (mThreadViz) + { + SWR_ASSERT(bt.vizFile != nullptr); + if (mBuckets[id].enableThreadViz) + { + VIZ_START_DATA data{ VIZ_START, id, __rdtsc() }; + Serialize(bt.vizFile, data); + } + } + else + { + if (bt.pCurrent->children.size() < mBuckets.size()) + { + bt.pCurrent->children.resize(mBuckets.size()); + } + BUCKET &child = bt.pCurrent->children[id]; + child.pParent = bt.pCurrent; + child.id = id; + child.start = __rdtsc(); + + // update thread's currently executing bucket + bt.pCurrent = &child; + } + + bt.level++; + } + + // stop the currently executing bucket + INLINE void StopBucket(UINT id) + { + SWR_ASSERT(tlsThreadId < mThreads.size()); + BUCKET_THREAD &bt = mThreads[tlsThreadId]; + + if (bt.level == 0) return; + + if (mThreadViz) + { + SWR_ASSERT(bt.vizFile != nullptr); + if (mBuckets[id].enableThreadViz) + { + VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() }; + Serialize(bt.vizFile, data); + } + } + else + { + if (bt.pCurrent->start == 0) return; + SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); + + bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start); + bt.pCurrent->count++; + + // pop to parent + bt.pCurrent = bt.pCurrent->pParent; + } + + bt.level--; + } + + INLINE void AddEvent(uint32_t id, uint32_t count) + { + if (!mCapturing) return; + + SWR_ASSERT(tlsThreadId < mThreads.size()); + + BUCKET_THREAD& bt = mThreads[tlsThreadId]; + + // don't record events for threadviz + if (!mThreadViz) + { + if (bt.pCurrent->children.size() < mBuckets.size()) + { + bt.pCurrent->children.resize(mBuckets.size()); + } + BUCKET &child = bt.pCurrent->children[id]; + child.pParent = bt.pCurrent; + child.id = id; + child.count += count; + } + } + +private: + void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket); + void PrintThread(FILE* f, const BUCKET_THREAD& thread); + + // list of active threads that have registered with this manager + std::vector<BUCKET_THREAD> mThreads; + + // list of buckets registered with this manager + std::vector<BUCKET_DESC> mBuckets; + + // is capturing currently enabled + volatile bool mCapturing{ false }; + + std::mutex mThreadMutex; + + // enable threadviz + bool mThreadViz{ false }; + std::string mThreadVizDir; +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h new file mode 100644 index 00000000000..41c6d5dec79 --- /dev/null +++ 
b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -0,0 +1,171 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rdtsc_buckets_shared.h
+*
+* @brief shared structures for rdtsc buckets.
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include <vector>
+#include <string>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+
+struct BUCKET
+{
+    uint32_t id{ 0 };
+    uint64_t start{ 0 };
+    uint64_t elapsed{ 0 };
+    uint32_t count{ 0 };
+
+    BUCKET* pParent{ nullptr };
+    std::vector<BUCKET> children;
+};
+
+struct BUCKET_DESC
+{
+    // name of bucket, used in reports
+    std::string name;
+
+    // description of bucket, used in threadviz
+    std::string description;
+
+    // enable for threadviz dumping
+    bool enableThreadViz;
+
+    // threadviz color of bucket, in RGBA8_UNORM format
+    uint32_t color;
+};
+
+struct BUCKET_THREAD
+{
+    // name of thread, used in reports
+    std::string name;
+
+    // id for this thread, assigned by the thread manager
+    uint32_t id;
+
+    // root of the bucket hierarchy for this thread
+    BUCKET root;
+
+    // currently executing bucket somewhere in the hierarchy
+    BUCKET* pCurrent;
+
+    // currently executing hierarchy level
+    uint32_t level{ 0 };
+
+    // threadviz file object
+    FILE* vizFile{ nullptr };
+
+    BUCKET_THREAD() {}
+    BUCKET_THREAD(const BUCKET_THREAD& that)
+    {
+        name = that.name;
+        id = that.id;
+        root = that.root;
+        pCurrent = &root;
+        vizFile = that.vizFile;
+    }
+};
+
+enum VIZ_TYPE
+{
+    VIZ_START = 0,
+    VIZ_STOP = 1,
+    VIZ_DATA = 2
+};
+
+struct VIZ_START_DATA
+{
+    uint8_t type;
+    uint32_t bucketId;
+    uint64_t timestamp;
+};
+
+struct VIZ_STOP_DATA
+{
+    uint8_t type;
+    uint64_t timestamp;
+};
+
+inline void Serialize(FILE* f, const VIZ_START_DATA& data)
+{
+    fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
+}
+
+inline void Deserialize(FILE* f, VIZ_START_DATA& data)
+{
+    fread(&data, sizeof(VIZ_START_DATA), 1, f);
+    assert(data.type == VIZ_START);
+}
+
+inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
+{
+    fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
+}
+
+inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
+{
+    fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
+    assert(data.type == VIZ_STOP);
+}
+
+inline void Serialize(FILE* f, const std::string& string)
+{
+    // the length prefix is a single byte, so 255 characters is the maximum
+    assert(string.size() < 256);
+
+    uint8_t length = (uint8_t)string.size();
+    fwrite(&length, sizeof(length), 1, f);
+    fwrite(string.c_str(), string.size(), 1, f);
+}
+
+inline void Deserialize(FILE* f, std::string& string)
+{
+    char cstr[256];
+    uint8_t length;
+    fread(&length, sizeof(length), 1, f);
+    fread(cstr, length, 1, f);
+    cstr[length] = 0;
+    string.assign(cstr);
+}
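
// Illustrative usage sketch (editor's example, not part of the patch): the
// two helpers above implement a length-prefixed string format; a round trip
// through a temporary file looks like this. The function name and the string
// contents are hypothetical.
inline void ExampleStringRoundTrip()
{
    FILE* f = tmpfile();                // binary read/write temp file
    Serialize(f, std::string("FEClipTriangles"));
    rewind(f);                          // seek back to the one-byte length prefix
    std::string s;
    Deserialize(f, s);                  // s == "FEClipTriangles"
    fclose(f);
}
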
+inline void Serialize(FILE* f, const BUCKET_DESC& desc)
+{
+    Serialize(f, desc.name);
+    Serialize(f, desc.description);
+    fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
+    fwrite(&desc.color, sizeof(desc.color), 1, f);
+}
+
+inline void Deserialize(FILE* f, BUCKET_DESC& desc)
+{
+    Deserialize(f, desc.name);
+    Deserialize(f, desc.description);
+    fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
+    fread(&desc.color, sizeof(desc.color), 1, f);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
new file mode 100644
index 00000000000..8fa6d9ef408
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -0,0 +1,787 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/ + +#ifndef __SWR_SIMDINTRIN_H__ +#define __SWR_SIMDINTRIN_H__ + +#include "os.h" + +#include <cassert> + +#include <emmintrin.h> +#include <immintrin.h> +#include <xmmintrin.h> + +#if KNOB_SIMD_WIDTH == 8 +typedef __m256 simdscalar; +typedef __m256i simdscalari; +typedef uint8_t simdmask; +#else +#error Unsupported vector width +#endif + +// simd vector +OSALIGNSIMD(union) simdvector +{ + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } +}; + +#if KNOB_SIMD_WIDTH == 8 +#define _simd128_maskstore_ps _mm_maskstore_ps +#define _simd_load_ps _mm256_load_ps +#define _simd_load1_ps _mm256_broadcast_ss +#define _simd_loadu_ps _mm256_loadu_ps +#define _simd_setzero_ps _mm256_setzero_ps +#define _simd_set1_ps _mm256_set1_ps +#define _simd_blend_ps _mm256_blend_ps +#define _simd_blendv_ps _mm256_blendv_ps +#define _simd_store_ps _mm256_store_ps +#define _simd_mul_ps _mm256_mul_ps +#define _simd_add_ps _mm256_add_ps +#define _simd_sub_ps _mm256_sub_ps +#define _simd_rsqrt_ps _mm256_rsqrt_ps +#define _simd_min_ps _mm256_min_ps +#define _simd_max_ps _mm256_max_ps +#define _simd_movemask_ps _mm256_movemask_ps +#define _simd_cvtps_epi32 _mm256_cvtps_epi32 +#define _simd_cvttps_epi32 _mm256_cvttps_epi32 +#define _simd_cvtepi32_ps _mm256_cvtepi32_ps +#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) +#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) +#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ) +#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) +#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) +#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) +#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm) +#define _simd_and_ps _mm256_and_ps +#define _simd_or_ps _mm256_or_ps + +#define _simd_rcp_ps _mm256_rcp_ps +#define _simd_div_ps _mm256_div_ps +#define _simd_castsi_ps _mm256_castsi256_ps +#define _simd_andnot_ps _mm256_andnot_ps +#define _simd_round_ps _mm256_round_ps +#define _simd_castpd_ps _mm256_castpd_ps +#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) + +#define _simd_load_sd _mm256_load_sd +#define _simd_movemask_pd _mm256_movemask_pd +#define _simd_castsi_pd _mm256_castsi256_pd + +// emulated integer simd +#define SIMD_EMU_EPI(func, intrin) \ +INLINE \ +__m256i func(__m256i a, __m256i b)\ +{\ + __m128i aHi = _mm256_extractf128_si256(a, 1);\ + __m128i bHi = _mm256_extractf128_si256(b, 1);\ + __m128i aLo = _mm256_castsi256_si128(a);\ + __m128i bLo = _mm256_castsi256_si128(b);\ +\ + __m128i subLo = intrin(aLo, bLo);\ + __m128i subHi = intrin(aHi, bHi);\ +\ + __m256i result = _mm256_castsi128_si256(subLo);\ + result = _mm256_insertf128_si256(result, subHi, 1);\ +\ + return result;\ +} + +#if (KNOB_ARCH == KNOB_ARCH_AVX) +#define _simd_mul_epi32 _simdemu_mul_epi32 +#define _simd_mullo_epi32 _simdemu_mullo_epi32 +#define _simd_sub_epi32 _simdemu_sub_epi32 +#define _simd_sub_epi64 _simdemu_sub_epi64 +#define _simd_min_epi32 _simdemu_min_epi32 +#define _simd_min_epu32 _simdemu_min_epu32 +#define _simd_max_epi32 _simdemu_max_epi32 +#define _simd_max_epu32 _simdemu_max_epu32 +#define _simd_add_epi32 _simdemu_add_epi32 +#define _simd_and_si _simdemu_and_si +#define _simd_andnot_si _simdemu_andnot_si +#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32 +#define _simd_cmplt_epi32 _simdemu_cmplt_epi32 +#define 
_simd_cmpgt_epi32 _simdemu_cmpgt_epi32 +#define _simd_or_si _simdemu_or_si +#define _simd_castps_si _mm256_castps_si256 +#define _simd_adds_epu8 _simdemu_adds_epu8 +#define _simd_subs_epu8 _simdemu_subs_epu8 +#define _simd_add_epi8 _simdemu_add_epi8 +#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 +#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_movemask_epi8 _simdemu_movemask_epi8 + +SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) +SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) +SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32) +SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64) +SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32) +SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32) +SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32) +SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32) +SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32) +SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128) +SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128) +SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32) +SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32) +SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32) +SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128) +SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8) +SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) +SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) +SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) + +#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) +#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) + +#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i) +#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i) +#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i) +#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a))) + +#define _simd128_fmadd_ps _mm_fmaddemu_ps +#define _simd_fmadd_ps _mm_fmaddemu256_ps +#define _simd_fmsub_ps _mm_fmsubemu256_ps +#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 +SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) + +INLINE +__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) +{ + __m128 res = _mm_mul_ps(a, b); + res = _mm_add_ps(res, c); + return res; +} + +INLINE +__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) +{ + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_add_ps(res, c); + return res; +} + +INLINE +__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) +{ + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_sub_ps(res, c); + return res; +} + +INLINE +__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) +{ + uint32_t *pOffsets = (uint32_t*)&vOffsets; + simdscalar vResult; + float* pResult = (float*)&vResult; + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) + { + uint32_t offset = pOffsets[i]; + offset = offset * scale; + pResult[i] = *(float*)(((const uint8_t*)pBase + offset)); + } + + return vResult; +} + +INLINE +__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) +{ + uint32_t *pOffsets = (uint32_t*)&vOffsets; + simdscalar vResult = vSrc; + float* pResult = (float*)&vResult; + DWORD index; + uint32_t mask = _simd_movemask_ps(vMask); + while (_BitScanForward(&index, mask)) + { + mask &= ~(1 << index); + uint32_t offset = pOffsets[index]; + offset = offset * scale; + pResult[index] = *(float*)(((const uint8_t*)pBase + offset)); + } + + return vResult; +} + +INLINE +__m256i 
_simd_abs_epi32(__m256i a) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + __m128i absLo = _mm_abs_epi32(aLo); + __m128i absHi = _mm_abs_epi32(aHi); + __m256i result = _mm256_castsi128_si256(absLo); + result = _mm256_insertf128_si256(result, absHi, 1); + return result; +} + +INLINE +int _simdemu_movemask_epi8(__m256i a) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + int resHi = _mm_movemask_epi8(aHi); + int resLo = _mm_movemask_epi8(aLo); + + return (resHi << 16) | resLo; +} +#else + +#define _simd_mul_epi32 _mm256_mul_epi32 +#define _simd_mullo_epi32 _mm256_mullo_epi32 +#define _simd_sub_epi32 _mm256_sub_epi32 +#define _simd_sub_epi64 _mm256_sub_epi64 +#define _simd_min_epi32 _mm256_min_epi32 +#define _simd_max_epi32 _mm256_max_epi32 +#define _simd_min_epu32 _mm256_min_epu32 +#define _simd_max_epu32 _mm256_max_epu32 +#define _simd_add_epi32 _mm256_add_epi32 +#define _simd_and_si _mm256_and_si256 +#define _simd_andnot_si _mm256_andnot_si256 +#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32 +#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) +#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) +#define _simd_or_si _mm256_or_si256 +#define _simd_castps_si _mm256_castps_si256 + +#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 +#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 + +#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a) +#define _simd_slli_epi32 _mm256_slli_epi32 +#define _simd_srai_epi32 _mm256_srai_epi32 +#define _simd_srli_epi32 _mm256_srli_epi32 +#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a))) +#define _simd128_fmadd_ps _mm_fmadd_ps +#define _simd_fmadd_ps _mm256_fmadd_ps +#define _simd_fmsub_ps _mm256_fmsub_ps +#define _simd_shuffle_epi8 _mm256_shuffle_epi8 +#define _simd_adds_epu8 _mm256_adds_epu8 +#define _simd_subs_epu8 _mm256_subs_epu8 +#define _simd_add_epi8 _mm256_add_epi8 +#define _simd_i32gather_ps _mm256_i32gather_ps +#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps +#define _simd_abs_epi32 _mm256_abs_epi32 + +#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 +#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_movemask_epi8 _mm256_movemask_epi8 +#endif + +#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) +#define _simd_shuffle_ps _mm256_shuffle_ps +#define _simd_set1_epi32 _mm256_set1_epi32 +#define _simd_set1_epi8 _mm256_set1_epi8 +#define _simd_setzero_si _mm256_setzero_si256 +#define _simd_cvttps_epi32 _mm256_cvttps_epi32 +#define _simd_store_si _mm256_store_si256 +#define _simd_broadcast_ss _mm256_broadcast_ss +#define _simd_maskstore_ps _mm256_maskstore_ps +#define _simd_load_si _mm256_load_si256 +#define _simd_loadu_si _mm256_loadu_si256 +#define _simd_sub_ps _mm256_sub_ps +#define _simd_testz_ps _mm256_testz_ps +#define _simd_xor_ps _mm256_xor_ps + + +INLINE +simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask) +{ + return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask)); +} + +// convert bitmask to vector mask +INLINE +simdscalar vMask(int32_t mask) +{ + __m256i vec = _mm256_set1_epi32(mask); + const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = _simd_and_si(vec, bit); + vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); + return _simd_castsi_ps(vec); +} + +INLINE +void _simd_mov(simdscalar &r, unsigned int 
rlane, simdscalar& s, unsigned int slane) +{ + OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH]; + _mm256_store_ps(rArray, r); + _mm256_store_ps(sArray, s); + rArray[rlane] = sArray[slane]; + r = _mm256_load_ps(rArray); +} + +INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_slli_epi32(aHi, i); + __m128i resLo = _mm_slli_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_srai_epi32(aHi, i); + __m128i resLo = _mm_srai_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_srli_epi32(aHi, i); + __m128i resLo = _mm_srli_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE +void _simdvec_transpose(simdvector &v) +{ + SWR_ASSERT(false, "Need to implement 8 wide version"); +} + +#else +#error Unsupported vector width +#endif + +// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww. +INLINE +void _simdvec_load_ps(simdvector& r, const float *p) +{ + r[0] = _simd_set1_ps(p[0]); + r[1] = _simd_set1_ps(p[1]); + r[2] = _simd_set1_ps(p[2]); + r[3] = _simd_set1_ps(p[3]); +} + +INLINE +void _simdvec_mov(simdvector& r, const simdscalar& s) +{ + r[0] = s; + r[1] = s; + r[2] = s; + r[3] = s; +} + +INLINE +void _simdvec_mov(simdvector& r, const simdvector& v) +{ + r[0] = v[0]; + r[1] = v[1]; + r[2] = v[2]; + r[3] = v[3]; +} + +// just move a lane from the source simdvector to dest simdvector +INLINE +void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) +{ + _simd_mov(r[0], rlane, s[0], slane); + _simd_mov(r[1], rlane, s[1], slane); + _simd_mov(r[2], rlane, s[2], slane); + _simd_mov(r[3], rlane, s[3], slane); +} + +INLINE +void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) +{ + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) +} + +INLINE +void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) +{ + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + + tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) +} + +INLINE +simdscalar _simdvec_rcp_length_ps(const simdvector& v) +{ + simdscalar length; + _simdvec_dp4_ps(length, v, v); + return _simd_rsqrt_ps(length); +} + +INLINE +void _simdvec_normalize_ps(simdvector& r, const simdvector& v) +{ + simdscalar vecLength; + vecLength 
= _simdvec_rcp_length_ps(v);
+
+    r[0] = _simd_mul_ps(v[0], vecLength);
+    r[1] = _simd_mul_ps(v[1], vecLength);
+    r[2] = _simd_mul_ps(v[2], vecLength);
+    r[3] = _simd_mul_ps(v[3], vecLength);
+}
+
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
+{
+    r[0] = _simd_mul_ps(v[0], s);
+    r[1] = _simd_mul_ps(v[1], s);
+    r[2] = _simd_mul_ps(v[2], s);
+    r[3] = _simd_mul_ps(v[3], s);
+}
+
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    r[0] = _simd_mul_ps(v0[0], v1[0]);
+    r[1] = _simd_mul_ps(v0[1], v1[1]);
+    r[2] = _simd_mul_ps(v0[2], v1[2]);
+    r[3] = _simd_mul_ps(v0[3], v1[3]);
+}
+
+INLINE
+void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    r[0] = _simd_add_ps(v0[0], v1[0]);
+    r[1] = _simd_add_ps(v0[1], v1[1]);
+    r[2] = _simd_add_ps(v0[2], v1[2]);
+    r[3] = _simd_add_ps(v0[3], v1[3]);
+}
+
+INLINE
+void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    r[0] = _simd_min_ps(v0[0], s);
+    r[1] = _simd_min_ps(v0[1], s);
+    r[2] = _simd_min_ps(v0[2], s);
+    r[3] = _simd_min_ps(v0[3], s);
+}
+
+INLINE
+void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    r[0] = _simd_max_ps(v0[0], s);
+    r[1] = _simd_max_ps(v0[1], s);
+    r[2] = _simd_max_ps(v0[2], s);
+    r[3] = _simd_max_ps(v0[3], s);
+}
+
+// Matrix4x4 * Vector4
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+INLINE
+void _simd_mat4x4_vec4_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[2] = r0;
+
+    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[3] = r0;
+}
+
+// Matrix4x4 * Vector3 - Direction Vector where w = 0.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
+INLINE
+void _simd_mat3x3_vec3_w0_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[2] = r0;
+
+    result[3] = _simd_setzero_ps();
+}
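
// Editor's sketch (illustrative, not part of the patch): a scalar reference
// for the w == 1 transforms below. Because v.w is implicitly 1, the
// translation column m[row][3] is added directly, which is why the SIMD code
// uses a plain _simd_add_ps for the last column instead of a multiply-add.
static inline void ref_mat4x4_vec3_w1(float out[4], const float m[16], const float v[3])
{
    for (int row = 0; row < 4; ++row)
    {
        out[row] = m[row * 4 + 0] * v[0]
                 + m[row * 4 + 1] * v[1]
                 + m[row * 4 + 2] * v[2]
                 + m[row * 4 + 3];          // * v.w, with v.w == 1
    }
}
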
+// Matrix4x4 * Vector3 - Position vector where w = 1.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+INLINE
+void _simd_mat4x4_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[2] = r0;
+
+    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+    result[3] = _simd_add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+}
+
+INLINE
+void _simd_mat4x3_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[2] = r0;
+    result[3] = _simd_set1_ps(1.0f);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Compute plane equation vA * vX + vB * vY + vC
+INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+{
+    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
+    vOut = _simd_fmadd_ps(vB, vY, vOut);
+    return vOut;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Interpolates a single component.
+/// @param vI - barycentric I
+/// @param vJ - barycentric J
+/// @param pInterpBuffer - pointer to attribute barycentric coeffs
+template<UINT Attrib, UINT Comp>
+static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+{
+    const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
+    const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
+    const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];
+
+    simdscalar vA = _simd_broadcast_ss(pInterpA);
+    simdscalar vB = _simd_broadcast_ss(pInterpB);
+    simdscalar vC = _simd_broadcast_ss(pInterpC);
+
+    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
+    vC = _simd_mul_ps(vk, vC);
+
+    return vplaneps(vA, vB, vC, vI, vJ);
+}
+
+
+#endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
new file mode 100644
index 00000000000..0bffd2c8000
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
@@ -0,0 +1,238 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include "common/os.h" +#include <stdarg.h> +#include <stdio.h> +#include <assert.h> + +#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS + +#if defined(_WIN32) +#pragma comment(lib, "user32.lib") +#endif // _WIN32 + +enum TextColor +{ + TEXT_BLACK = 0, + TEXT_RED = 1, + TEXT_GREEN = 2, + TEXT_BLUE = 4, + TEXT_PURPLE = TEXT_RED | TEXT_BLUE, + TEXT_CYAN = TEXT_GREEN | TEXT_BLUE, + TEXT_YELLOW = TEXT_RED | TEXT_GREEN, + TEXT_WHITE = TEXT_RED | TEXT_GREEN | TEXT_BLUE, +}; + +enum TextStyle +{ + TEXT_NORMAL = 0, + TEXT_INTENSITY = 1, +}; + +void SetTextColor(FILE* stream, TextColor color = TEXT_WHITE, TextStyle style = TEXT_NORMAL) +{ +#if defined(_WIN32) + + HANDLE hConsoleHandle = nullptr; + if (stream == stderr) + { + hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE); + } + else if (stream == stdout) + { + hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE); + } + else + { + // Not a console stream, do nothing + return; + } + + WORD textAttributes = 0; + if (color & TEXT_RED) + { + textAttributes |= FOREGROUND_RED; + } + if (color & TEXT_GREEN) + { + textAttributes |= FOREGROUND_GREEN; + } + if (color & TEXT_BLUE) + { + textAttributes |= FOREGROUND_BLUE; + } + if (style & TEXT_INTENSITY) + { + textAttributes |= FOREGROUND_INTENSITY; + } + SetConsoleTextAttribute(hConsoleHandle, textAttributes); + +#else // !_WIN32 + + // Print ANSI codes + uint32_t cc = 30 + (style ? 60 : 0) + color; + fprintf(stream, "\033[0m\033[%d;%dm", style, cc); + +#endif +} + +void ResetTextColor(FILE* stream) +{ +#if defined(_WIN32) + + SetTextColor(stream); + +#else // !_WIN32 + + // Print ANSI codes + fprintf(stream, "\033[0m"); + +#endif +} + +bool SwrAssert( + bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* pFunction, + const char* pFmtString /* = nullptr */, + ...) 
+{ + if (!enabled) return false; + + SetTextColor(stderr, TEXT_CYAN, TEXT_NORMAL); + + fprintf(stderr, "%s(%d): ", pFileName, lineNum); + + SetTextColor(stderr, TEXT_RED, TEXT_INTENSITY); + + fprintf(stderr, "ASSERT: %s\n", pExpression); + + SetTextColor(stderr, TEXT_CYAN, TEXT_INTENSITY); + fprintf(stderr, "\t%s\n", pFunction); + + if (pFmtString) + { + SetTextColor(stderr, TEXT_YELLOW, TEXT_INTENSITY); + fprintf(stderr, "\t"); + va_list args; + va_start(args, pFmtString); + vfprintf(stderr, pFmtString, args); + va_end(args); + fprintf(stderr, "\n"); + } + ResetTextColor(stderr); + fflush(stderr); + +#if defined(_WIN32) + static const int MAX_MESSAGE_LEN = 2048; + char msgBuf[MAX_MESSAGE_LEN]; + + sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression); + msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; + msgBuf[MAX_MESSAGE_LEN - 1] = 0; + OutputDebugStringA(msgBuf); + + sprintf_s(msgBuf, "\t%s\n", pFunction); + msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; + msgBuf[MAX_MESSAGE_LEN - 1] = 0; + OutputDebugStringA(msgBuf); + + int offset = 0; + + if (pFmtString) + { + va_list args; + va_start(args, pFmtString); + offset = _vsnprintf_s( + msgBuf, + sizeof(msgBuf), + sizeof(msgBuf), + pFmtString, + args); + va_end(args); + + if (offset < 0) { return true; } + + OutputDebugStringA("\t"); + OutputDebugStringA(msgBuf); + OutputDebugStringA("\n"); + } + + if (KNOB_ENABLE_ASSERT_DIALOGS) + { + int retval = sprintf_s( + &msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n" + "File: %s\n" + "Line: %d\n" + "\n" + "Expression: %s\n\n" + "Cancel: Disable this assert for the remainder of the process\n" + "Try Again: Break into the debugger\n" + "Continue: Continue execution (but leave assert enabled)", + pFileName, + lineNum, + pExpression); + + if (retval < 0) { return true; } + + offset += retval; + + if (!IsDebuggerPresent()) + { + sprintf_s( + &msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); + } + + retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION); + + switch (retval) + { + case IDCANCEL: + enabled = false; + return false; + + case IDTRYAGAIN: + return true; + + case IDCONTINUE: + return false; + } + } + else + { + return IsDebuggerPresent() || !chkDebugger; + } +#endif // _WIN32 + + return true; +} + +#endif // SWR_ENABLE_ASSERTS diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h new file mode 100644 index 00000000000..fecadb3d499 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h @@ -0,0 +1,109 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#ifndef __SWR_ASSERT_H__ +#define __SWR_ASSERT_H__ + +#if !defined(__SWR_OS_H__) +#error swr_assert.h should not be included directly, please include "common/os.h" instead. +#endif + +#if !defined(SWR_ENABLE_ASSERTS) + +#if !defined(NDEBUG) +#define SWR_ENABLE_ASSERTS 1 +#else +#define SWR_ENABLE_ASSERTS 0 +#endif // _DEBUG + +#endif // SWR_ENABLE_ASSERTS + +#if !defined(SWR_ENABLE_REL_ASSERTS) +#define SWR_ENABLE_REL_ASSERTS 1 +#endif + +#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS +#include "assert.h" + +#if !defined(__cplusplus) + +#pragma message("C++ is required for SWR Asserts, falling back to assert.h") + +#if SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) assert(e) +#endif + +#if SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) assert(e) +#endif + +#else + +#if SWR_ENABLE_ASSERTS +#if defined(assert) +#undef assert +#endif +#define assert(exp) SWR_ASSERT(exp) +#endif + +bool SwrAssert( + bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* function, + const char* pFmtString = nullptr, + ...); + +#define _SWR_ASSERT(chkDebugger, e, ...) {\ + bool expFailed = !(e);\ + if (expFailed) {\ + static bool swrAssertEnabled = true;\ + expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\ + if (expFailed) { DEBUGBREAK; }\ + }\ +} + +#if SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__) +#endif + +#if SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__) +#endif +#endif // C++ + +#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS + +#if !SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) +#endif + +#if !SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) +#endif + +#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__) + +#endif//__SWR_ASSERT_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp new file mode 100644 index 00000000000..fccccab503c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -0,0 +1,1511 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file api.cpp +* +* @brief API implementation +* +******************************************************************************/ + +#include <cfloat> +#include <cmath> +#include <cstdio> + +#include "core/api.h" +#include "core/backend.h" +#include "core/context.h" +#include "core/frontend.h" +#include "core/rasterizer.h" +#include "core/rdtsc_core.h" +#include "core/threads.h" +#include "core/tilemgr.h" +#include "core/clip.h" + +#include "common/simdintrin.h" +#include "common/os.h" + +void SetupDefaultState(SWR_CONTEXT *pContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Create SWR Context. +/// @param pCreateInfo - pointer to creation info. +HANDLE SwrCreateContext( + const SWR_CREATECONTEXT_INFO* pCreateInfo) +{ + RDTSC_RESET(); + RDTSC_INIT(0); + + void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); + memset(pContextMem, 0, sizeof(SWR_CONTEXT)); + SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); + + pContext->driverType = pCreateInfo->driver; + pContext->privateStateSize = pCreateInfo->privateStateSize; + + pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); + memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); + + pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); + memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); + + pContext->numSubContexts = pCreateInfo->maxSubContexts; + if (pContext->numSubContexts > 1) + { + pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64); + memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts); + } + + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + pContext->dcRing[dc].pArena = new Arena(); + pContext->dcRing[dc].inUse = false; + pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); + pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. + + pContext->dsRing[dc].pArena = new Arena(); + } + + if (!KNOB_SINGLE_THREADED) + { + memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); + memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); + new (&pContext->WaitLock) std::mutex(); + new (&pContext->FifosNotEmpty) std::condition_variable(); + + CreateThreadPool(pContext, &pContext->threadPool); + } + + // Calling createThreadPool() above can set SINGLE_THREADED + if (KNOB_SINGLE_THREADED) + { + pContext->NumWorkerThreads = 1; + } + + // Allocate scratch space for workers. + ///@note We could lazily allocate this but its rather small amount of memory. + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + ///@todo Use numa API for allocations using numa information from thread data (if exists). 
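        // Editor's aside (not part of the patch): simdintrin.h in this tree
        // only accepts KNOB_SIMD_WIDTH == 8, so the alignment requested below
        // is 8 * 4 = 32 bytes -- one full simdscalar -- and each worker gets
        // a private 32KB scratch block.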
+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); + } + + pContext->nextDrawId = 1; + pContext->DrawEnqueued = 1; + + // State setup AFTER context is fully initialized + SetupDefaultState(pContext); + + // initialize hot tile manager + pContext->pHotTileMgr = new HotTileMgr(); + + // initialize function pointer tables + InitClearTilesTable(); + + // initialize store tiles function + pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; + pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; + pContext->pfnClearTile = pCreateInfo->pfnClearTile; + + return (HANDLE)pContext; +} + +void SwrDestroyContext(HANDLE hContext) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DestroyThreadPool(pContext, &pContext->threadPool); + + // free the fifos + for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) + { + delete pContext->dcRing[i].pArena; + delete pContext->dsRing[i].pArena; + delete(pContext->dcRing[i].pTileMgr); + delete(pContext->dcRing[i].pDispatch); + } + + // Free scratch space. + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + _aligned_free(pContext->pScratch[i]); + } + + _aligned_free(pContext->dcRing); + _aligned_free(pContext->dsRing); + _aligned_free(pContext->subCtxSave); + + delete(pContext->pHotTileMgr); + + pContext->~SWR_CONTEXT(); + _aligned_free((SWR_CONTEXT*)hContext); +} + +void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) +{ + memcpy(&dst.state, &src.state, sizeof(API_STATE)); +} + +void WakeAllThreads(SWR_CONTEXT *pContext) +{ + pContext->FifosNotEmpty.notify_all(); +} + +bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) +{ + // For single thread nothing should still be drawing. + if (KNOB_SINGLE_THREADED) { return false; } + + if (pDC->isCompute) + { + if (pDC->doneCompute) + { + pDC->inUse = false; + return false; + } + } + + // Check if backend work is done. First make sure all triangles have been binned. + if (pDC->doneFE == true) + { + // ensure workers have all moved passed this draw + if (pDC->threadsDoneFE != pContext->NumWorkerThreads) + { + return true; + } + + if (pDC->threadsDoneBE != pContext->NumWorkerThreads) + { + return true; + } + + pDC->inUse = false; // all work is done. + } + + return pDC->inUse; +} + +void QueueDraw(SWR_CONTEXT *pContext) +{ + SWR_ASSERT(pContext->pCurDrawContext->inUse == false); + pContext->pCurDrawContext->inUse = true; + + _ReadWriteBarrier(); + { + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pContext->DrawEnqueued++; + } + + if (KNOB_SINGLE_THREADED) + { + // flush denormals to 0 + uint32_t mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + + std::unordered_set<uint32_t> lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + + // restore csr + _mm_setcsr(mxcsr); + } + else + { + RDTSC_START(APIDrawWakeAllThreads); + WakeAllThreads(pContext); + RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); + } + + // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. 
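    // (Editor's note, not part of the patch: the previous DC must be kept
    // here because GetDrawContext() seeds the next draw's state by calling
    // CopyState() from pPrevDrawContext.)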
+ pContext->pPrevDrawContext = pContext->pCurDrawContext; + pContext->pCurDrawContext = nullptr; +} + +///@todo Combine this with QueueDraw +void QueueDispatch(SWR_CONTEXT *pContext) +{ + SWR_ASSERT(pContext->pCurDrawContext->inUse == false); + pContext->pCurDrawContext->inUse = true; + + _ReadWriteBarrier(); + { + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pContext->DrawEnqueued++; + } + + if (KNOB_SINGLE_THREADED) + { + // flush denormals to 0 + uint32_t mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + + // restore csr + _mm_setcsr(mxcsr); + } + else + { + RDTSC_START(APIDrawWakeAllThreads); + WakeAllThreads(pContext); + RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); + } + + // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. + pContext->pPrevDrawContext = pContext->pCurDrawContext; + pContext->pCurDrawContext = nullptr; +} + +DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) +{ + RDTSC_START(APIGetDrawContext); + // If current draw context is null then need to obtain a new draw context to use from ring. + if (pContext->pCurDrawContext == nullptr) + { + uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + + // Need to wait until this draw context is available to use. + while (StillDrawing(pContext, pCurDrawContext)) + { + _mm_pause(); + } + + // Assign next available entry in DS ring to this DC. + uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; + pCurDrawContext->pState = &pContext->dsRing[dsIndex]; + + Arena& stateArena = *(pCurDrawContext->pState->pArena); + + // Copy previous state to current state. + if (pContext->pPrevDrawContext) + { + DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; + + // If we're splitting our draw then we can just use the same state from the previous + // draw. In this case, we won't increment the DS ring index so the next non-split + // draw can receive the state. + if (isSplitDraw == false) + { + CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); + + stateArena.Reset(true); // Reset memory. + pCurDrawContext->pState->pPrivateState = nullptr; + + pContext->curStateId++; // Progress state ring index forward. + } + else + { + // If its a split draw then just copy the state pointer over + // since its the same draw. + pCurDrawContext->pState = pPrevDrawContext->pState; + } + } + else + { + stateArena.Reset(); // Reset memory. + pContext->curStateId++; // Progress state ring index forward. + } + + pCurDrawContext->dependency = 0; + pCurDrawContext->pArena->Reset(); + pCurDrawContext->pContext = pContext; + pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
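        // (Editor's note, not part of the patch: inUse stays false until
        // QueueDraw()/QueueDispatch() submits this DC -- both assert on it --
        // so a ring slot is considered free again the moment its workers
        // retire the previous draw.)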
+ pCurDrawContext->inUse = false; + + pCurDrawContext->doneCompute = false; + pCurDrawContext->doneFE = false; + pCurDrawContext->FeLock = 0; + pCurDrawContext->threadsDoneFE = 0; + pCurDrawContext->threadsDoneBE = 0; + + pCurDrawContext->pTileMgr->initialize(); + + // Assign unique drawId for this DC + pCurDrawContext->drawId = pContext->nextDrawId++; + } + else + { + SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); + } + + RDTSC_STOP(APIGetDrawContext, 0, 0); + return pContext->pCurDrawContext; +} + +void SWR_API SwrSetActiveSubContext( + HANDLE hContext, + uint32_t subContextIndex) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + if (subContextIndex >= pContext->numSubContexts) + { + return; + } + + if (subContextIndex != pContext->curSubCtxId) + { + // Save and restore draw state + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + CopyState( + pContext->subCtxSave[pContext->curSubCtxId], + *(pDC->pState)); + + CopyState( + *(pDC->pState), + pContext->subCtxSave[subContextIndex]); + + pContext->curSubCtxId = subContextIndex; + } +} + +API_STATE* GetDrawState(SWR_CONTEXT *pContext) +{ + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_ASSERT(pDC->pState != nullptr); + + return &pDC->pState->state; +} + +void SetupDefaultState(SWR_CONTEXT *pContext) +{ + API_STATE* pState = GetDrawState(pContext); + + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; +} + +static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) +{ + return (SWR_CONTEXT*)hContext; +} + +void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3) +{ + RDTSC_START(APISync); + + SWR_ASSERT(pfnFunc != nullptr); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->FeWork.type = SYNC; + pDC->FeWork.pfnWork = ProcessSync; + pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; + pDC->FeWork.desc.sync.userData = userData; + pDC->FeWork.desc.sync.userData2 = userData2; + pDC->FeWork.desc.sync.userData3 = userData3; + + // cannot execute until all previous draws have completed + pDC->dependency = pDC->drawId - 1; + + //enqueue + QueueDraw(pContext); + + RDTSC_STOP(APISync, 1, 0); +} + +void SwrWaitForIdle(HANDLE hContext) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + + RDTSC_START(APIWaitForIdle); + // Wait for all work to complete. 
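    // (Editor's note, not part of the patch: StillDrawing() reports a draw DC
    // as idle only after threadsDoneFE and threadsDoneBE both reach
    // NumWorkerThreads, so this loop is a full front-end and back-end drain.)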
+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; + + while (StillDrawing(pContext, pDC)) + { + _mm_pause(); + } + } + RDTSC_STOP(APIWaitForIdle, 1, 0); +} + +void SwrSetVertexBuffers( + HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + for (uint32_t i = 0; i < numBuffers; ++i) + { + const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; + pState->vertexBuffers[pVB->index] = *pVB; + } +} + +void SwrSetIndexBuffer( + HANDLE hContext, + const SWR_INDEX_BUFFER_STATE* pIndexBuffer) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->indexBuffer = *pIndexBuffer; +} + +void SwrSetFetchFunc( + HANDLE hContext, + PFN_FETCH_FUNC pfnFetchFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->pfnFetchFunc = pfnFetchFunc; +} + +void SwrSetSoFunc( + HANDLE hContext, + PFN_SO_FUNC pfnSoFunc, + uint32_t streamIndex) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + SWR_ASSERT(streamIndex < MAX_SO_STREAMS); + + pState->pfnSoFunc[streamIndex] = pfnSoFunc; +} + +void SwrSetSoState( + HANDLE hContext, + SWR_STREAMOUT_STATE* pSoState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->soState = *pSoState; +} + +void SwrSetSoBuffers( + HANDLE hContext, + SWR_STREAMOUT_BUFFER* pSoBuffer, + uint32_t slot) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); + + pState->soBuffer[slot] = *pSoBuffer; +} + +void SwrSetVertexFunc( + HANDLE hContext, + PFN_VERTEX_FUNC pfnVertexFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->pfnVertexFunc = pfnVertexFunc; +} + +void SwrSetFrontendState( + HANDLE hContext, + SWR_FRONTEND_STATE *pFEState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->frontendState = *pFEState; +} + +void SwrSetGsState( + HANDLE hContext, + SWR_GS_STATE *pGSState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->gsState = *pGSState; +} + +void SwrSetGsFunc( + HANDLE hContext, + PFN_GS_FUNC pfnGsFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->pfnGsFunc = pfnGsFunc; +} + +void SwrSetCsFunc( + HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->pfnCsFunc = pfnCsFunc; + pState->totalThreadsInGroup = totalThreadsInGroup; +} + +void SwrSetTsState( + HANDLE hContext, + SWR_TS_STATE *pState) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->tsState = *pState; +} + +void SwrSetHsFunc( + HANDLE hContext, + PFN_HS_FUNC pfnFunc) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->pfnHsFunc = pfnFunc; +} + +void SwrSetDsFunc( + HANDLE hContext, + PFN_DS_FUNC pfnFunc) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->pfnDsFunc = pfnFunc; +} + +void SwrSetDepthStencilState( + HANDLE hContext, + SWR_DEPTH_STENCIL_STATE *pDSState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->depthStencilState = *pDSState; +} + +void SwrSetBackendState( + HANDLE hContext, + SWR_BACKEND_STATE *pBEState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->backendState = *pBEState; +} + +void SwrSetPixelShaderState( + HANDLE hContext, + SWR_PS_STATE *pPSState) +{ + API_STATE 
*pState = GetDrawState(GetContext(hContext)); + pState->psState = *pPSState; +} + +void SwrSetBlendState( + HANDLE hContext, + SWR_BLEND_STATE *pBlendState) +{ + API_STATE *pState = GetDrawState(GetContext(hContext)); + memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); +} + +void SwrSetBlendFunc( + HANDLE hContext, + uint32_t renderTarget, + PFN_BLEND_JIT_FUNC pfnBlendFunc) +{ + SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); + API_STATE *pState = GetDrawState(GetContext(hContext)); + pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; +} + +void SwrSetLinkage( + HANDLE hContext, + uint32_t mask, + const uint8_t* pMap) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + static const uint8_t IDENTITY_MAP[] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), + "Update for new value of MAX_ATTRIBUTES"); + + pState->linkageMask = mask; + pState->linkageCount = _mm_popcnt_u32(mask); + + if (!pMap) + { + pMap = IDENTITY_MAP; + } + memcpy(pState->linkageMap, pMap, pState->linkageCount); +} + +// update guardband multipliers for the viewport +void updateGuardband(API_STATE *pState) +{ + // guardband center is viewport center + pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; + pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; + pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; + pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; +} + +void SwrSetRastState( + HANDLE hContext, + const SWR_RASTSTATE *pRastState) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); + + memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); +} + +void SwrSetViewports( + HANDLE hContext, + uint32_t numViewports, + const SWR_VIEWPORT* pViewports, + const SWR_VIEWPORT_MATRIX* pMatrices) +{ + SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, + "Invalid number of viewports."); + + SWR_CONTEXT *pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); + + memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); + + if (pMatrices != nullptr) + { + memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); + } + else + { + // Compute default viewport transform. + for (uint32_t i = 0; i < numViewports; ++i) + { + if (pContext->driverType == DX) + { + pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; + pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; + pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; + pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; + pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; + pState->vpMatrix[i].m32 = pState->vp[i].minZ; + } + else + { + // Standard, with the exception that Y is inverted. + pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; + pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; + pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; + pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; + pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; + pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; + + // Now that the matrix is calculated, clip the view coords to screen size. + // OpenGL allows for -ve x,y in the viewport. 
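+                // (Editorial worked example; not part of the original patch.)
+                // For a GL-style viewport of x = 0, y = 0, width = 1024,
+                // height = 768, minZ = 0, maxZ = 1, the expressions above give
+                // m00 = 512, m11 = -384, m22 = 0.5, m30 = 512, m31 = 384,
+                // m32 = 0.5: NDC (-1,-1) maps to window (0, 768) and NDC (1,1)
+                // maps to (1024, 0), i.e. Y is inverted as noted above.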
+                pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
+                pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
+            }
+        }
+    }
+
+    updateGuardband(pState);
+}
+
+void SwrSetScissorRects(
+    HANDLE hContext,
+    uint32_t numScissors,
+    const BBOX* pScissors)
+{
+    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
+        "Invalid number of scissor rects.");
+
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
+}
+
+void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
+{
+    API_STATE *pState = &pDC->pState->state;
+    uint32_t left, right, top, bottom;
+
+    // Set up scissor dimensions based on scissor or viewport
+    if (pState->rastState.scissorEnable)
+    {
+        // scissor rect right/bottom edges are exclusive; the core expects scissor dimensions to be inclusive, so one pixel is subtracted from the right/bottom edges below
+        left = pState->scissorRects[0].left;
+        right = pState->scissorRects[0].right;
+        top = pState->scissorRects[0].top;
+        bottom = pState->scissorRects[0].bottom;
+    }
+    else
+    {
+        left = (int32_t)pState->vp[0].x;
+        right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
+        top = (int32_t)pState->vp[0].y;
+        bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
+    }
+
+    right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
+    bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
+
+    if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
+    {
+        pState->scissorInFixedPoint.left = 0;
+        pState->scissorInFixedPoint.right = 0;
+        pState->scissorInFixedPoint.top = 0;
+        pState->scissorInFixedPoint.bottom = 0;
+    }
+    else
+    {
+        pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
+        pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
+        pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
+        pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
+    }
+}
+
+void SetupPipeline(DRAW_CONTEXT *pDC)
+{
+    DRAW_STATE* pState = pDC->pState;
+    const SWR_RASTSTATE &rastState = pState->state.rastState;
+    BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
+    const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
+
+    // setup backend
+    if (pState->state.psState.pfnPixelShader == nullptr)
+    {
+        backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
+        // always need to generate I & J per sample for Z interpolation
+        backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
+    }
+    else
+    {
+        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
+        const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ?
1 : 0; + + // currently only support 'normal' input coverage + SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + + // select backend function + switch(pState->state.psState.shadingRate) + { + case SWR_SHADING_RATE_PIXEL: + if(bMultisampleEnable) + { + // always need to generate I & J per sample for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + } + else + { + // always need to generate I & J per pixel for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); + backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + } + break; + case SWR_SHADING_RATE_SAMPLE: + SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); + // always need to generate I & J per sample for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + break; + case SWR_SHADING_RATE_COARSE: + default: + SWR_ASSERT(0 && "Invalid shading rate"); + break; + } + + // setup pointer to function that generates necessary barycentrics required by the PS + bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0; + backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics]; + + bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0; + backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics]; + + bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 
1 : 0; + backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount]; + } + + PFN_PROCESS_PRIMS pfnBinner; + switch (pState->state.topology) + { + case TOP_POINT_LIST: + pState->pfnProcessPrims = ClipPoints; + pfnBinner = BinPoints; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pState->pfnProcessPrims = ClipLines; + pfnBinner = BinLines; + break; + default: + pState->pfnProcessPrims = ClipTriangles; + pfnBinner = BinTriangles; + break; + }; + + // disable clipper if viewport transform is disabled + if (pState->state.frontendState.vpTransformDisable) + { + pState->pfnProcessPrims = pfnBinner; + } + + if ((pState->state.psState.pfnPixelShader == nullptr) && + (pState->state.depthStencilState.depthTestEnable == FALSE) && + (pState->state.depthStencilState.depthWriteEnable == FALSE) && + (pState->state.depthStencilState.stencilTestEnable == FALSE) && + (pState->state.depthStencilState.stencilWriteEnable == FALSE) && + (pState->state.linkageCount == 0)) + { + pState->pfnProcessPrims = nullptr; + pState->state.linkageMask = 0; + } + + if (pState->state.soState.rasterizerDisable == true) + { + pState->pfnProcessPrims = nullptr; + pState->state.linkageMask = 0; + } + + // set up the frontend attrib mask + pState->state.feAttribMask = pState->state.linkageMask; + if (pState->state.soState.soEnable) + { + for (uint32_t i = 0; i < 4; ++i) + { + pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; + } + } + + // complicated logic to test for cases where we don't need backing hottile memory for a draw + // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. + pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && + !pState->state.depthStencilState.depthWriteEnable && + pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && + (pState->state.depthStencilState.depthTestEnable || + pState->state.depthStencilState.depthWriteEnable)) ? true : false; + + pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || + // for stencil we have to check the double sided state as well + (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && + (pState->state.depthStencilState.stencilTestEnable || + pState->state.depthStencilState.stencilWriteEnable)) ? true : false; + + uint32_t numRTs = pState->state.psState.numRenderTargets; + pState->state.colorHottileEnable = 0; + if(pState->state.psState.pfnPixelShader != nullptr) + { + for (uint32_t rt = 0; rt < numRTs; ++rt) + { + pState->state.colorHottileEnable |= + (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || + !pState->state.blendState.renderTarget[rt].writeDisableRed || + !pState->state.blendState.renderTarget[rt].writeDisableGreen || + !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief InitDraw +/// @param pDC - Draw context to initialize for this draw. 
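+/// @param isSplitDraw - Whether this draw continues a split draw; scissor and
+///        pipeline setup is skipped in that case, since it was already done
+///        for the first piece of the draw.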
+void InitDraw( + DRAW_CONTEXT *pDC, + bool isSplitDraw) +{ + // We don't need to re-setup the scissors/pipeline state again for split draw. + if (isSplitDraw == false) + { + SetupMacroTileScissors(pDC); + SetupPipeline(pDC); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief We can split the draw for certain topologies for better performance. +/// @param totalVerts - Total vertices for draw +/// @param topology - Topology used for draw +uint32_t MaxVertsPerDraw( + DRAW_CONTEXT* pDC, + uint32_t totalVerts, + PRIMITIVE_TOPOLOGY topology) +{ + API_STATE& state = pDC->pState->state; + + uint32_t vertsPerDraw = totalVerts; + + if (state.soState.soEnable) + { + return totalVerts; + } + + switch (topology) + { + case TOP_POINT_LIST: + case TOP_TRIANGLE_LIST: + vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; + break; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + if (pDC->pState->state.tsState.tsEnable) + { + uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; + vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; + } + break; + + default: + // We are not splitting up draws for other topologies. + break; + } + + return vertsPerDraw; +} + +// Recursive template used to auto-nest conditionals. Converts dynamic boolean function +// arguments to static template arguments. +template <bool... ArgsB> +struct FEDrawChooser +{ + // Last Arg Terminator + static PFN_FE_WORK_FUNC GetFunc(bool bArg) + { + if (bArg) + { + return ProcessDraw<ArgsB..., true>; + } + + return ProcessDraw<ArgsB..., false>; + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs) + { + if (bArg) + { + return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...); + } + + return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...); + } +}; + +// Selector for correct templated Draw front-end function +INLINE +static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled) +{ + return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled); +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). +/// @param startVertex - Specifies start vertex for draw. (vertex data) +/// @param numInstances - How many instances to render. 
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void DrawInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertices, + uint32_t startVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDraw); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); + uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); + int32_t remainingVerts = numVertices; + + API_STATE *pState = &pDC->pState->state; + pState->topology = topology; + pState->forceFront = false; + + // disable culling for points/lines + uint32_t oldCullMode = pState->rastState.cullMode; + if (topology == TOP_POINT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->forceFront = true; + } + + int draw = 0; + while (remainingVerts) + { + uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? + remainingVerts : maxVertsPerDraw; + + bool isSplitDraw = (draw > 0) ? true : false; + DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); + InitDraw(pDC, isSplitDraw); + + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetFEDrawFunc( + false, // IsIndexed + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.numVerts = numVertsForDraw; + pDC->FeWork.desc.draw.startVertex = startVertex; + pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.desc.draw.startInstance = startInstance; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; + + //enqueue DC + QueueDraw(pContext); + + remainingVerts -= numVertsForDraw; + draw++; + } + + // restore culling state + pDC = GetDrawContext(pContext); + pDC->pState->state.rastState.cullMode = oldCullMode; + + RDTSC_STOP(APIDraw, numVertices * numInstances, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDraw +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param startVertex - Specifies start vertex in vertex buffer for draw. +/// @param primCount - Number of vertices. +void SwrDraw( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t startVertex, + uint32_t numVertices) +{ + DrawInstanced(hContext, topology, numVertices, startVertex); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDrawInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. +/// @param numInstances - How many instances to render. +/// @param startVertex - Specifies start vertex for draw. 
(vertex data) +/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void SwrDrawInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertsPerInstance, + uint32_t numInstances, + uint32_t startVertex, + uint32_t startInstance + ) +{ + DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawIndexedInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. +/// @param numInstances - Number of instances to render. +/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void DrawIndexedInstance( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDrawIndexed); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + API_STATE* pState = &pDC->pState->state; + + int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); + uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); + int32_t remainingIndices = numIndices; + + uint32_t indexSize = 0; + switch (pState->indexBuffer.format) + { + case R32_UINT: indexSize = sizeof(uint32_t); break; + case R16_UINT: indexSize = sizeof(uint16_t); break; + case R8_UINT: indexSize = sizeof(uint8_t); break; + default: + SWR_ASSERT(0); + } + + int draw = 0; + uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; + pIB += (uint64_t)indexOffset * (uint64_t)indexSize; + + pState->topology = topology; + pState->forceFront = false; + + // disable culling for points/lines + uint32_t oldCullMode = pState->rastState.cullMode; + if (topology == TOP_POINT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->forceFront = true; + } + + while (remainingIndices) + { + uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? + remainingIndices : maxIndicesPerDraw; + + // When breaking up draw, we need to obtain new draw context for each iteration. + bool isSplitDraw = (draw > 0) ? 
true : false; + pDC = GetDrawContext(pContext, isSplitDraw); + InitDraw(pDC, isSplitDraw); + + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetFEDrawFunc( + true, // IsIndexed + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.pDC = pDC; + pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; + pDC->FeWork.desc.draw.pIB = (int*)pIB; + pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; + + pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.desc.draw.startInstance = startInstance; + pDC->FeWork.desc.draw.baseVertex = baseVertex; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + + //enqueue DC + QueueDraw(pContext); + + pIB += maxIndicesPerDraw * indexSize; + remainingIndices -= numIndicesForDraw; + draw++; + } + + // restore culling state + pDC = GetDrawContext(pContext); + pDC->pState->state.rastState.cullMode = oldCullMode; + + RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawIndexed +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. +void SwrDrawIndexed( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex + ) +{ + DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDrawIndexedInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param numInstances - Number of instances to render. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void SwrDrawIndexedInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t numInstances, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t startInstance) +{ + DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); +} + +// Attach surfaces to pipeline +void SwrInvalidateTiles( + HANDLE hContext, + uint32_t attachmentMask) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + // Queue a load to the hottile + pDC->FeWork.type = INVALIDATETILES; + pDC->FeWork.pfnWork = ProcessInvalidateTiles; + pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; + + //enqueue + QueueDraw(pContext); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDispatch +/// @param hContext - Handle passed back from SwrCreateContext +/// @param threadGroupCountX - Number of thread groups dispatched in X direction +/// @param threadGroupCountY - Number of thread groups dispatched in Y direction +/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction +void SwrDispatch( + HANDLE hContext, + uint32_t threadGroupCountX, + uint32_t threadGroupCountY, + uint32_t threadGroupCountZ) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDispatch); + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->isCompute = true; // This is a compute context. + + // Ensure spill fill pointers are initialized to nullptr. + memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill)); + + COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); + + pTaskData->threadGroupCountX = threadGroupCountX; + pTaskData->threadGroupCountY = threadGroupCountY; + pTaskData->threadGroupCountZ = threadGroupCountZ; + + uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; + pDC->pDispatch->initialize(totalThreadGroups, pTaskData); + + QueueDispatch(pContext); + RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); +} + +// Deswizzles, converts and stores current contents of the hot tiles to surface +// described by pState +void SwrStoreTiles( + HANDLE hContext, + SWR_RENDERTARGET_ATTACHMENT attachment, + SWR_TILE_STATE postStoreTileState) +{ + RDTSC_START(APIStoreTiles); + + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + SetupMacroTileScissors(pDC); + + pDC->FeWork.type = STORETILES; + pDC->FeWork.pfnWork = ProcessStoreTiles; + pDC->FeWork.desc.storeTiles.attachment = attachment; + pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; + + //enqueue + QueueDraw(pContext); + + RDTSC_STOP(APIStoreTiles, 0, 0); +} + +void SwrClearRenderTarget( + HANDLE hContext, + uint32_t clearMask, + const float clearColor[4], + float z, + BYTE stencil) +{ + RDTSC_START(APIClearRenderTarget); + + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + SetupMacroTileScissors(pDC); + + CLEAR_FLAGS flags; + flags.mask = clearMask; + + pDC->FeWork.type = CLEAR; + pDC->FeWork.pfnWork = ProcessClear; + pDC->FeWork.desc.clear.flags = flags; + pDC->FeWork.desc.clear.clearDepth = z; + pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; + pDC->FeWork.desc.clear.clearRTColor[1] = 
clearColor[1];
+    pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
+    pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
+    pDC->FeWork.desc.clear.clearStencil = stencil;
+
+    // enqueue draw
+    QueueDraw(pContext);
+
+    RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external components such as the
+/// sampler.
+/// SWR is responsible for the allocation of the private context state.
+/// @param hContext - Handle passed back from SwrCreateContext
VOID* SwrGetPrivateContextState(
+    HANDLE hContext)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    DRAW_STATE* pState = pDC->pState;
+
+    if (pState->pPrivateState == nullptr)
+    {
+        pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
+    }
+
+    return pState->pPrivateState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once the operation
+/// has completed. Clients can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SwrAllocDrawContextMemory(
+    HANDLE hContext,
+    uint32_t size,
+    uint32_t align)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    return pDC->pState->pArena->AllocAligned(size, align);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns pointer to SWR stats.
+/// @note The counters are atomically incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @todo If necessary, add a callback to avoid stalling the pipe to
+/// sample the counters.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SwrGetStats(
+    HANDLE hContext,
+    SWR_STATS* pStats)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->FeWork.type = QUERYSTATS;
+    pDC->FeWork.pfnWork = ProcessQueryStats;
+    pDC->FeWork.desc.queryStats.pStats = pStats;
+
+    // cannot execute until all previous draws have completed
+    pDC->dependency = pDC->drawId - 1;
+
+    //enqueue
+    QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
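+/// @note (Illustrative usage added by the editor; not part of the original patch.)
+/// @code
+///     SwrEnableStats(hContext, true);
+///     // ... issue draws ...
+///     SWR_STATS stats = {};
+///     SwrGetStats(hContext, &stats);  // enqueued; depends on all prior draws
+///     SwrWaitForIdle(hContext);       // ensure the query itself has executed
+/// @endcode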
+void SwrEnableStats( + HANDLE hContext, + bool enable) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->pState->state.enableStats = enable; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Mark end of frame - used for performance profiling +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrEndFrame( + HANDLE hContext) +{ + RDTSC_ENDFRAME(); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h new file mode 100644 index 00000000000..72fae8b2c21 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -0,0 +1,500 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file api.h +* +* @brief API definitions +* +******************************************************************************/ + +#ifndef __SWR_API_H__ +#define __SWR_API_H__ + +#include "common/os.h" + +#include <assert.h> +#include <vector> + +#include "common/simdintrin.h" +#include "common/formats.h" +#include "core/utils.h" +#include "core/state.h" + +///@todo place all the API functions into the 'swr' namespace. 
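+// ---------------------------------------------------------------------------
+// (Illustrative overview added by the editor; not part of the original patch.)
+// A minimal context lifetime, using only entry points declared in this header:
+//
+//     SWR_CREATECONTEXT_INFO info = {};
+//     info.driver = DX;             // DRIVER_TYPE picks the viewport convention
+//     info.privateStateSize = 0;    // per-draw state for external components
+//     HANDLE hContext = SwrCreateContext(&info);
+//     // ... Swr* state setters, then SwrDraw()/SwrDrawIndexed() ...
+//     SwrWaitForIdle(hContext);
+//     SwrDestroyContext(hContext);
+// ---------------------------------------------------------------------------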
+ +typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3); + +////////////////////////////////////////////////////////////////////////// +/// @brief Function signature for load hot tiles +/// @param hPrivateContext - handle to private data +/// @param dstFormat - format of the hot tile +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pDstHotTile - pointer to the hot tile surface +typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); + +////////////////////////////////////////////////////////////////////////// +/// @brief Function signature for store hot tiles +/// @param hPrivateContext - handle to private data +/// @param srcFormat - format of the hot tile +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pSrcHotTile - pointer to the hot tile surface +typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); + +/// @brief Function signature for clearing from the hot tiles clear value +/// @param hPrivateContext - handle to private data +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pClearColor - pointer to the hot tile's clear value +typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, + SWR_RENDERTARGET_ATTACHMENT rtIndex, + uint32_t x, uint32_t y, const float* pClearColor); + +////////////////////////////////////////////////////////////////////////// +/// SWR_CREATECONTEXT_INFO +///////////////////////////////////////////////////////////////////////// +struct SWR_CREATECONTEXT_INFO +{ + DRIVER_TYPE driver; + + // External functions (e.g. sampler) need per draw context state. + // Use SwrGetPrivateContextState() to access private state. + uint32_t privateStateSize; + + // Each SWR context can have multiple sets of active state + uint32_t maxSubContexts; + + // tile manipulation functions + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RECT +///////////////////////////////////////////////////////////////////////// +struct SWR_RECT +{ + uint32_t left; + uint32_t right; + uint32_t top; + uint32_t bottom; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Create SWR Context. +/// @param pCreateInfo - pointer to creation info. +HANDLE SWR_API SwrCreateContext( + const SWR_CREATECONTEXT_INFO* pCreateInfo); + +////////////////////////////////////////////////////////////////////////// +/// @brief Destroys SWR Context. +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrDestroyContext( + HANDLE hContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set currently active state context +/// @param subContextIndex - value from 0 to +/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0. 
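+/// @note (Illustrative example added by the editor; not part of the original patch.)
+/// @code
+///     // SWR_CREATECONTEXT_INFO::maxSubContexts must have been >= 2
+///     SwrSetActiveSubContext(hContext, 1); // state setters now target set 1
+///     SwrSetActiveSubContext(hContext, 0); // back to the default state set
+/// @endcode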
+void SWR_API SwrSetActiveSubContext( + HANDLE hContext, + uint32_t subContextIndex); + +////////////////////////////////////////////////////////////////////////// +/// @brief Sync cmd. Executes the callback func when all rendering up to this sync +/// has been completed +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - pointer to callback function, +/// @param userData - user data to pass back +void SWR_API SwrSync( + HANDLE hContext, + PFN_CALLBACK_FUNC pfnFunc, + uint64_t userData, + uint64_t userData2, + uint64_t userData3 = 0); + +////////////////////////////////////////////////////////////////////////// +/// @brief Blocks until all rendering has been completed. +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrWaitForIdle( + HANDLE hContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set vertex buffer state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param numBuffers - Number of vertex buffer state descriptors. +/// @param pVertexBuffers - Array of vertex buffer state descriptors. +void SWR_API SwrSetVertexBuffers( + HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set index buffer +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pIndexBuffer - Index buffer. +void SWR_API SwrSetIndexBuffer( + HANDLE hContext, + const SWR_INDEX_BUFFER_STATE* pIndexBuffer); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set fetch shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFetchFunc - Pointer to shader. +void SWR_API SwrSetFetchFunc( + HANDLE hContext, + PFN_FETCH_FUNC pfnFetchFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnSoFunc - Pointer to shader. +/// @param streamIndex - specifies stream +void SWR_API SwrSetSoFunc( + HANDLE hContext, + PFN_SO_FUNC pfnSoFunc, + uint32_t streamIndex); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pSoState - Pointer to streamout state. +void SWR_API SwrSetSoState( + HANDLE hContext, + SWR_STREAMOUT_STATE* pSoState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout buffer state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pSoBuffer - Pointer to streamout buffer. +/// @param slot - Slot to bind SO buffer to. +void SWR_API SwrSetSoBuffers( + HANDLE hContext, + SWR_STREAMOUT_BUFFER* pSoBuffer, + uint32_t slot); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set vertex shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnVertexFunc - Pointer to shader. +void SWR_API SwrSetVertexFunc( + HANDLE hContext, + PFN_VERTEX_FUNC pfnVertexFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set frontend state. 
+/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetFrontendState( + HANDLE hContext, + SWR_FRONTEND_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set geometry shader state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetGsState( + HANDLE hContext, + SWR_GS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set geometry shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to geometry shader function +void SWR_API SwrSetGsFunc( + HANDLE hContext, + PFN_GS_FUNC pfnGsFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set compute shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to compute shader function +/// @param totalThreadsInGroup - product of thread group dimensions. +void SWR_API SwrSetCsFunc( + HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set tessellation state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetTsState( + HANDLE hContext, + SWR_TS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set hull shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - Pointer to shader function +void SWR_API SwrSetHsFunc( + HANDLE hContext, + PFN_HS_FUNC pfnFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set domain shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - Pointer to shader function +void SWR_API SwrSetDsFunc( + HANDLE hContext, + PFN_DS_FUNC pfnFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set depth stencil state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetDepthStencilState( + HANDLE hContext, + SWR_DEPTH_STENCIL_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set backend state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetBackendState( + HANDLE hContext, + SWR_BACKEND_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set pixel shader state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetPixelShaderState( + HANDLE hContext, + SWR_PS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set blend state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. 
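+/// @note (Editorial note; not part of the original patch.) The implementation
+/// copies the state by value, so the caller's SWR_BLEND_STATE does not need
+/// to outlive this call:
+/// @code
+///     SWR_BLEND_STATE blend = {};
+///     blend.sampleCount = SWR_MULTISAMPLE_1X;
+///     SwrSetBlendState(hContext, &blend);
+/// @endcode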
+void SWR_API SwrSetBlendState(
+    HANDLE hContext,
+    SWR_BLEND_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set blend function
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param renderTarget - render target index
+/// @param pfnBlendFunc - function pointer
+void SWR_API SwrSetBlendFunc(
+    HANDLE hContext,
+    uint32_t renderTarget,
+    PFN_BLEND_JIT_FUNC pfnBlendFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set linkage mask
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param mask - Specifies which vertex outputs are needed by PS.
+/// @param pMap - (Optional) Linkage map to specify where FE attributes are
+/// gathered from to supply PS attribute values. The length
+/// of the map buffer needs to match the number of set bits
+/// in "mask".
+void SWR_API SwrSetLinkage(
+    HANDLE hContext,
+    uint32_t mask,
+    const uint8_t* pMap);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDraw
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param startVertex - Specifies start vertex in vertex buffer for draw.
+/// @param primCount - Number of vertices.
+void SWR_API SwrDraw(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t startVertex,
+    uint32_t primCount);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
+/// @param numInstances - How many instances to render.
+/// @param startVertex - Specifies start vertex for draw. (vertex data)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawInstanced(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numVertsPerInstance,
+    uint32_t numInstances,
+    uint32_t startVertex,
+    uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexed
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+void SWR_API SwrDrawIndexed(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numIndices,
+    uint32_t indexOffset,
+    int32_t baseVertex);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexedInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param numInstances - Number of instances to render.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawIndexedInstanced(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numIndices,
+    uint32_t numInstances,
+    uint32_t indexOffset,
+    int32_t baseVertex,
+    uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which of the surfaces attached to the hottiles should be invalidated.
+void SWR_API SwrInvalidateTiles(
+    HANDLE hContext,
+    uint32_t attachmentMask);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDispatch
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param threadGroupCountX - Number of thread groups dispatched in X direction
+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
+void SWR_API SwrDispatch(
+    HANDLE hContext,
+    uint32_t threadGroupCountX,
+    uint32_t threadGroupCountY,
+    uint32_t threadGroupCountZ);
+
+
+enum SWR_TILE_STATE
+{
+    SWR_TILE_INVALID = 0,    // tile is in an uninitialized state and should be loaded with surface contents before rendering
+    SWR_TILE_DIRTY = 2,      // tile contains newer data than the surface it represents
+    SWR_TILE_RESOLVED = 3,   // tile is in sync with the surface it represents
+};
+
+/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
+void SWR_API SwrStoreTiles(
+    HANDLE hContext,
+    SWR_RENDERTARGET_ATTACHMENT attachment,
+    SWR_TILE_STATE postStoreTileState);
+
+void SWR_API SwrClearRenderTarget(
+    HANDLE hContext,
+    uint32_t clearMask,
+    const FLOAT clearColor[4],
+    float z,
+    BYTE stencil);
+
+void SWR_API SwrSetRastState(
+    HANDLE hContext,
+    const SWR_RASTSTATE *pRastState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetViewports
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numViewports - number of viewports passed in
+/// @param pViewports - Specifies extents of viewport.
+/// @param pMatrices - If not specified then SWR computes a default one.
+void SWR_API SwrSetViewports(
+    HANDLE hContext,
+    uint32_t numViewports,
+    const SWR_VIEWPORT* pViewports,
+    const SWR_VIEWPORT_MATRIX* pMatrices);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetScissorRects
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numScissors - number of scissors passed in
+/// @param pScissors - array of scissors
+void SWR_API SwrSetScissorRects(
+    HANDLE hContext,
+    uint32_t numScissors,
+    const BBOX* pScissors);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external components such as the
+/// sampler.
+///
+/// @note Client needs to resend private state prior to each draw call.
+/// Also, SWR is responsible for the private state memory.
+/// @param hContext - Handle passed back from SwrCreateContext
+VOID* SWR_API SwrGetPrivateContextState(
+    HANDLE hContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once the operation
+/// has completed. Clients can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SWR_API SwrAllocDrawContextMemory(
+    HANDLE hContext,
+    uint32_t size,
+    uint32_t align);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns pointer to SWR stats.
+/// @note The counters are incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SWR_API SwrGetStats(
+    HANDLE hContext,
+    SWR_STATS* pStats);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SWR_API SwrEnableStats(
+    HANDLE hContext,
+    bool enable);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Mark end of frame - used for performance profiling
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrEndFrame(
+    HANDLE hContext);
+#endif//__SWR_API_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
new file mode 100644
index 00000000000..8184c8d3f4c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
@@ -0,0 +1,166 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.cpp
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations that are
+* associated with an operation and can all be freed at once when the
+* operation has completed. Allocations are cheap, since most of the time
+* it's simply an increment of an offset. There is also no need to free
+* individual allocations; all of the arena memory can be freed at once.
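+*
+* (Illustrative sketch added by the editor; not part of the original patch.)
+* The fast path of AllocAligned() below is just a bump of the current
+* block's offset:
+*
+*     pCurBlock->offset = AlignUp(pCurBlock->offset, align);
+*     if (pCurBlock->offset + size <= pCurBlock->blockSize)
+*     {
+*         void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
+*         pCurBlock->offset += size;
+*         return pMem;
+*     }
+*
+* Only when the current block is exhausted is a new, larger block allocated
+* and chained onto the block list.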
+* +******************************************************************************/ + +#include "context.h" +#include "arena.h" + +#include <cmath> + +Arena::Arena() + : m_pCurBlock(nullptr), m_size(0) +{ + m_pMutex = new std::mutex(); +} + +Arena::~Arena() +{ + Reset(); // Reset just in case to avoid leaking memory. + + if (m_pCurBlock) + { + _aligned_free(m_pCurBlock->pMem); + delete m_pCurBlock; + } + + delete m_pMutex; +} + +///@todo Remove this when all users have stopped using this. +void Arena::Init() +{ + m_size = 0; + m_pCurBlock = nullptr; + + m_pMutex = new std::mutex(); +} + +void* Arena::AllocAligned(size_t size, size_t align) +{ + if (m_pCurBlock) + { + ArenaBlock* pCurBlock = m_pCurBlock; + pCurBlock->offset = AlignUp(pCurBlock->offset, align); + + if ((pCurBlock->offset + size) <= pCurBlock->blockSize) + { + void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset); + pCurBlock->offset += size; + m_size += size; + return pMem; + } + + // Not enough memory in this block, fall through to allocate + // a new block + } + + static const size_t ArenaBlockSize = 1024*1024; + size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize)); + blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); + + void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. + SWR_ASSERT(pMem != nullptr); + + ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock(); + SWR_ASSERT(pNewBlock != nullptr); + + if (pNewBlock != nullptr) + { + pNewBlock->pNext = m_pCurBlock; + + m_pCurBlock = pNewBlock; + m_pCurBlock->pMem = pMem; + m_pCurBlock->blockSize = blockSize; + + } + + return AllocAligned(size, align); +} + +void* Arena::Alloc(size_t size) +{ + return AllocAligned(size, 1); +} + +void* Arena::AllocAlignedSync(size_t size, size_t align) +{ + void* pAlloc = nullptr; + + SWR_ASSERT(m_pMutex != nullptr); + + m_pMutex->lock(); + pAlloc = AllocAligned(size, align); + m_pMutex->unlock(); + + return pAlloc; +} + +void* Arena::AllocSync(size_t size) +{ + void* pAlloc = nullptr; + + SWR_ASSERT(m_pMutex != nullptr); + + m_pMutex->lock(); + pAlloc = Alloc(size); + m_pMutex->unlock(); + + return pAlloc; +} + +void Arena::Reset(bool removeAll) +{ + if (m_pCurBlock) + { + m_pCurBlock->offset = 0; + + ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; + while(pUsedBlocks) + { + ArenaBlock* pBlock = pUsedBlocks; + pUsedBlocks = pBlock->pNext; + + _aligned_free(pBlock->pMem); + delete pBlock; + } + + if (removeAll) + { + _aligned_free(m_pCurBlock->pMem); + delete m_pCurBlock; + m_pCurBlock = nullptr; + } + } + + m_size = 0; +} diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h new file mode 100644 index 00000000000..76eee11fb08 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -0,0 +1,69 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.h
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations that are
+* associated with an operation and can all be freed at once when the
+* operation has completed. Allocations are cheap, since most of the time
+* it's simply an increment of an offset. There is also no need to free
+* individual allocations; all of the arena memory can be freed at once.
+*
+******************************************************************************/
+#pragma once
+
+#include <mutex>
+
+class Arena
+{
+public:
+    Arena();
+    ~Arena();
+
+    void Init();
+
+    void* AllocAligned(size_t size, size_t align);
+    void* Alloc(size_t size);
+
+    void* AllocAlignedSync(size_t size, size_t align);
+    void* AllocSync(size_t size);
+
+    void Reset(bool removeAll = false);
+    size_t Size() { return m_size; }
+
+private:
+
+    struct ArenaBlock
+    {
+        void* pMem = nullptr;
+        size_t blockSize = 0;
+        size_t offset = 0;
+        ArenaBlock* pNext = nullptr;
+    };
+
+    ArenaBlock* m_pCurBlock = nullptr;
+    size_t m_size = 0;
+
+    /// @note Mutex is only used by sync allocation functions.
+    std::mutex* m_pMutex;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
new file mode 100644
index 00000000000..4a472bc9e5c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -0,0 +1,1899 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.cpp +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ + +#include <smmintrin.h> + +#include "rdtsc_core.h" +#include "backend.h" +#include "depthstencil.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" +#include "core/multisample.h" + +#include <algorithm> + +const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5}; +const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5}; + +/// @todo move to common lib +#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} +static const __m128 gMaskToVec[] = { + MASKTOVEC(0,0,0,0), + MASKTOVEC(0,0,0,1), + MASKTOVEC(0,0,1,0), + MASKTOVEC(0,0,1,1), + MASKTOVEC(0,1,0,0), + MASKTOVEC(0,1,0,1), + MASKTOVEC(0,1,1,0), + MASKTOVEC(0,1,1,1), + MASKTOVEC(1,0,0,0), + MASKTOVEC(1,0,0,1), + MASKTOVEC(1,0,1,0), + MASKTOVEC(1,0,1,1), + MASKTOVEC(1,1,0,0), + MASKTOVEC(1,1,0,1), + MASKTOVEC(1,1,1,0), + MASKTOVEC(1,1,1,1), +}; + +typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]); +static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Process compute work. +/// @param pDC - pointer to draw context (dispatch). +/// @param workerId - The unique worker ID that is assigned to this thread. +/// @param threadGroupId - the linear index for the thread group within the dispatch. +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) +{ + RDTSC_START(BEDispatch); + + SWR_CONTEXT *pContext = pDC->pContext; + + const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); + SWR_ASSERT(pTaskData != nullptr); + + // Ensure spill fill memory has been allocated. + if (pDC->pSpillFill[workerId] == nullptr) + { + ///@todo Add state which indicates the spill fill size. 
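+        // Spill/fill memory comes from the draw context's arena, a bump
+        // allocator, so this is normally just an offset increment guarded by
+        // the arena mutex (AllocAlignedSync wraps AllocAligned; see arena.cpp).
+        // The 4MB (4096 * 1024) size is a fixed worst-case guess until the
+        // @todo above is addressed; the alignment is one SIMD register
+        // (8 floats = 32 bytes).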
+ pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8); + } + + const API_STATE& state = GetApiState(pDC); + + SWR_CS_CONTEXT csContext{ 0 }; + csContext.tileCounter = threadGroupId; + csContext.dispatchDims[0] = pTaskData->threadGroupCountX; + csContext.dispatchDims[1] = pTaskData->threadGroupCountY; + csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; + csContext.pTGSM = pContext->pScratch[workerId]; + csContext.pSpillFillBuffer = pDC->pSpillFill[workerId]; + + state.pfnCsFunc(GetPrivateState(pDC), &csContext); + + UPDATE_STAT(CsInvocations, state.totalThreadsInGroup); + + RDTSC_STOP(BEDispatch, 1, 0); +} + +void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + SYNC_DESC *pSync = (SYNC_DESC*)pUserData; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroTile, x, y); + SWR_ASSERT(x == 0 && y == 0); + + if (pSync->pfnCallbackFunc != nullptr) + { + pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3); + } +} + +void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData; + SWR_STATS* pStats = pQueryDesc->pStats; + SWR_CONTEXT *pContext = pDC->pContext; + + SWR_ASSERT(pStats != nullptr); + + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + pStats->DepthPassCount += pContext->stats[i].DepthPassCount; + + pStats->IaVertices += pContext->stats[i].IaVertices; + pStats->IaPrimitives += pContext->stats[i].IaPrimitives; + pStats->VsInvocations += pContext->stats[i].VsInvocations; + pStats->HsInvocations += pContext->stats[i].HsInvocations; + pStats->DsInvocations += pContext->stats[i].DsInvocations; + pStats->GsInvocations += pContext->stats[i].GsInvocations; + pStats->PsInvocations += pContext->stats[i].PsInvocations; + pStats->CInvocations += pContext->stats[i].CInvocations; + pStats->CsInvocations += pContext->stats[i].CsInvocations; + pStats->CPrimitives += pContext->stats[i].CPrimitives; + pStats->GsPrimitives += pContext->stats[i].GsPrimitives; + + for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) + { + pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream]; + + /// @note client is required to provide valid write offset before every draw, so we clear + /// out the contents of the write offset when storing stats + pContext->stats[i].SoWriteOffset[stream] = 0; + + pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream]; + pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream]; + } + } +} + +template<SWR_FORMAT format> +void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) +{ + auto lambda = [&](int comp) + { + FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); + pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); + }; + + const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); + for (uint32_t i = 0; i < numIter; ++i) + { + UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); + } +} + +template<SWR_FORMAT format> +INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4]) +{ + // convert clear color to hottile format + // clear color is in RGBA float/uint32 + simdvector vClear; + for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) + { + simdscalar vComp; + vComp = _simd_load1_ps((const float*)&clear[comp]); + if 
(FormatTraits<format>::isNormalized(comp)) + { + vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp))); + vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); + } + vComp = FormatTraits<format>::pack(comp, vComp); + vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; + } + + uint32_t tileX, tileY; + MacroTileMgr::getTileIndices(macroTile, tileX, tileY); + const API_STATE& state = GetApiState(pDC); + + int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY; + int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1; + int left = KNOB_MACROTILE_X_DIM_FIXED * tileX; + int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1; + + // intersect with scissor + top = std::max(top, state.scissorInFixedPoint.top); + left = std::max(left, state.scissorInFixedPoint.left); + bottom = std::min(bottom, state.scissorInFixedPoint.bottom); + right = std::min(right, state.scissorInFixedPoint.right); + + // translate to local hottile origin + top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; + bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; + left -= KNOB_MACROTILE_X_DIM_FIXED * tileX; + right -= KNOB_MACROTILE_X_DIM_FIXED * tileX; + + // convert to raster tiles + top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + + const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + // compute steps between raster tile samples / raster tiles / macro tile rows + const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; + const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; + const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; + const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); + + HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples); + uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples; + uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; + + // loop over all raster tiles in the current hot tile + for (int y = top; y <= bottom; ++y) + { + uint8_t* pRasterTile = pRasterTileRow; + for (int x = left; x <= right; ++x) + { + for( int sampleNum = 0; sampleNum < numSamples; sampleNum++) + { + ClearRasterTile<format>(pRasterTile, vClear); + pRasterTile += rasterTileSampleStep; + } + } + pRasterTileRow += macroTileRowStep; + } + + pHotTile->state = HOTTILE_DIRTY; +} + + +void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + if (KNOB_FAST_CLEAR) + { + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + SWR_CONTEXT *pContext = pDC->pContext; + SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; + uint32_t numSamples = GetNumSamples(sampleCount); + + SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason. + + RDTSC_START(BEClear); + + if (pClear->flags.mask & SWR_CLEAR_COLOR) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples); + // All we want to do here is to mark the hot tile as being in a "needs clear" state. 
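+            // The actual memory writes are deferred: the clear value is stashed
+            // in the hot tile here and replayed lazily (see the HOTTILE_CLEAR
+            // handling in ProcessStoreTileBE below), so tiles that are cleared
+            // and never rendered to again stay cheap.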
+ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); + pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); + pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); + pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); + pHotTile->state = HOTTILE_CLEAR; + } + + if (pClear->flags.mask & SWR_CLEAR_DEPTH) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples); + pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; + pHotTile->state = HOTTILE_CLEAR; + } + + if (pClear->flags.mask & SWR_CLEAR_STENCIL) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples); + + pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil; + pHotTile->state = HOTTILE_CLEAR; + } + + RDTSC_STOP(BEClear, 0, 0); + } + else + { + // Legacy clear + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + RDTSC_START(BEClear); + + if (pClear->flags.mask & SWR_CLEAR_COLOR) + { + /// @todo clear data should come in as RGBA32_FLOAT + DWORD clearData[4]; + float clearFloat[4]; + clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; + clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; + clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; + clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; + clearData[0] = *(DWORD*)&clearFloat[0]; + clearData[1] = *(DWORD*)&clearFloat[1]; + clearData[2] = *(DWORD*)&clearFloat[2]; + clearData[3] = *(DWORD*)&clearFloat[3]; + + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData); + } + + if (pClear->flags.mask & SWR_CLEAR_DEPTH) + { + DWORD clearData[4]; + clearData[0] = *(DWORD*)&pClear->clearDepth; + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData); + } + + if (pClear->flags.mask & SWR_CLEAR_STENCIL) + { + uint32_t value = pClear->clearStencil; + DWORD clearData[4]; + clearData[0] = *(DWORD*)&value; + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; + + pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData); + } + + RDTSC_STOP(BEClear, 0, 0); + } +} + + +void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + RDTSC_START(BEStoreTiles); + STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; + SWR_CONTEXT *pContext = pDC->pContext; + +#ifdef KNOB_ENABLE_RDTSC + uint32_t numTiles = 0; +#endif + SWR_FORMAT srcFormat; + switch (pDesc->attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroTile, x, y); + + // Only need to store the hottile if it's been rendered to... 
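+    // Fetch the hot tile without creating it (the final 'false' argument
+    // appears to be a create flag); a null result means this macrotile was
+    // never rendered, so there is nothing to flush.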
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false); + if (pHotTile) + { + // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. + if (pHotTile->state == HOTTILE_CLEAR) + { + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData); + } + + if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) + { + int destX = KNOB_MACROTILE_X_DIM * x; + int destY = KNOB_MACROTILE_Y_DIM * y; + + pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, + pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + } + + + if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) + { + pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; + } + } + RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId); +} + + +void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; + SWR_CONTEXT *pContext = pDC->pContext; + + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) + { + if (pDesc->attachmentMask & (1 << i)) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); + if (pHotTile) + { + pHotTile->state = HOTTILE_INVALID; + } + } + } +} + +#if KNOB_SIMD_WIDTH == 8 +const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; +const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; +const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; +#define MASK 0xff +#else +#error Unsupported vector width +#endif + +INLINE +bool CanEarlyZ(const SWR_PS_STATE *pPSState) +{ + return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV)); +} + +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) +{ + simdscalar vClipMask = _simd_setzero_ps(); + uint32_t numClipDistance = _mm_popcnt_u32(clipMask); + + for (uint32_t i = 0; i < numClipDistance; ++i) + { + // pull triangle clip distance values from clip buffer + simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); + + // interpolate + simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); + + // clip if interpolated clip distance is < 0 || NAN + simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); + + vClipMask = _simd_or_ps(vClipMask, vCull); + } + + return _simd_movemask_ps(vClipMask); +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + 
            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+            mask[1] = _mm256_set1_epi32(-1);
+            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+        }
+
+        // gather coverage for samples 0-7
+        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+        if(MultisampleTraits<sampleCountT>::numSamples > 8)
+        {
+            // gather coverage for samples 8-15
+            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+        }
+    }
+    else
+    {
+        // center coverage is the same for all samples; just broadcast to the sample slots
+        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+        }
+    }
+
+    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+    // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+    __m256i packedCoverage1;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+    }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+
packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} + +template<bool perspMask> +INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + if(perspMask) + { + // evaluate I,J + psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); + psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); + psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); + psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); + } +} + +template<bool perspMask> +INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + if(perspMask) + { + // evaluate I,J + psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); + psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); + psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); + psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); + } +} + + 
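+//////////////////////////////////////////////////////////////////////////
+/// The barycentric helpers above all evaluate the same plane equation. As
+/// a scalar sketch (assuming vplaneps(a, b, c, x, y) computes a*x + b*y + c,
+/// consistent with its use throughout this file):
+///
+///     i        = (Ia*x + Ib*y + Ic) * recipDet;
+///     j        = (Ja*x + Jb*y + Jc) * recipDet;
+///     oneOverW = A1oW*i + B1oW*j + C1oW;
+///
+/// where A1oW/B1oW/C1oW are the per-vertex 1/w coefficients; perspective-
+/// correct attributes are interpolated the same way and then rescaled by
+/// the interpolated 1/w.
+//////////////////////////////////////////////////////////////////////////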
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows:
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
+//     evaluated as follows: if the SampleMask Rasterizer State is a subset of the samples in the pixel, then the first sample covered by the
+//     SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+
+    generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+
+    // Case (2) - partially covered pixel
+
+    // scan for first covered sample per pixel in the 4x2 span
+    unsigned long sampleNum[KNOB_SIMD_WIDTH];
+    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+    // look up and set the sample offsets from UL pixel corner for first covered sample
+    __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[6]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[5]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[4]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[3]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[2]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[1]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[0]));
+
+    __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[6]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[5]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[4]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[3]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[2]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[1]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[0]));
+    // add sample offset to UL pixel corner
+    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
+    __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const __m256i vZero = _simd_setzero_si();
+    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyway
+    unsigned long firstCoveredSampleMaskSample = 0;
+    (sampleMask > 0) ?
(_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); + + __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); + + vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample)); + vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample)); + + // blend in case 3a pixel locations + psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); + psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); +} + +template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount> +INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, + const uint64_t *const coverageMask, const uint32_t sampleMask, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + static const bool bPersp = (bool)persp; + static const bool bIsStandardPattern = (bool)standardPattern; + static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount; + + // calculate centroid positions + if(bPersp) + { + if(bIsStandardPattern) + { + ///@ todo: don't need to generate input coverage 2x if input coverage and centroid + CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL); + } + else + { + static const __m256 pixelCenter = _simd_set1_ps(0.5f); + psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter); + psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter); + } + // evaluate I,J + psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); + psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); + } +} + +template<uint32_t NumRT, uint32_t sampleCountT> +void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + uint8_t *pColorSample; + if(sampleCount == SWR_MULTISAMPLE_1X) + { + pColorSample = pColorBase[rt]; + } + else + { + pColorSample = pColorBase[rt] + rasterTileColorOffset; + } + + const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + sample, + pColorSample, + psContext.shaded[rt], + &psContext.oMask, + (simdscalari*)&coverageMask); + } + + // final write mask + simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); + + 
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + + const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); + + // store with color mask + if(!pRTBlend->writeDisableRed) + { + _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x); + } + if(!pRTBlend->writeDisableGreen) + { + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y); + } + if(!pRTBlend->writeDisableBlue) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z); + } + if(!pRTBlend->writeDisableAlpha) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + RDTSC_START(BESetup); + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + uint64_t coverageMask = work.coverageMask[0]; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); + coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); + coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); + + uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; + uint32_t NumRT = state.psState.numRenderTargets; + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] = renderBuffers.pColor[rt]; + } + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + psContext.pAttribs = work.pAttribs; + psContext.pPerspAttribs = work.pPerspAttribs; + psContext.frontFace = work.triFlags.frontFacing; + psContext.primID = work.triFlags.primID; + + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + psContext.I = work.I; + psContext.J = work.J; + psContext.recipDet = work.recipDet; + psContext.pRecipW = work.pRecipW; + psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX; + psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY; + + for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + // pixel center + psContext.vY.center = 
_simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + + for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { + if(bInputCoverage) + { + generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + } + + if(coverageMask & MASK) + { + RDTSC_START(BEBarycentric); + psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + // pixel center + psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + + backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); + + if(bCentroidPos) + { + // for 1x case, centroid is pixel center + psContext.vX.centroid = psContext.vX.center; + psContext.vY.centroid = psContext.vY.center; + psContext.vI.centroid = psContext.vI.center; + psContext.vJ.centroid = psContext.vJ.center; + psContext.vOneOverW.centroid = psContext.vOneOverW.center; + } + + // interpolate z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + RDTSC_STOP(BEBarycentric, 0, 0); + + simdmask clipCoverageMask = coverageMask & MASK; + + // interpolate user clip distance if available + if(rastState.clipDistanceMask) + { + clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, + psContext.vI.center, psContext.vJ.center); + } + + simdscalar vCoverageMask = vMask(clipCoverageMask); + simdscalar depthPassMask = vCoverageMask; + simdscalar stencilPassMask = vCoverageMask; + + // Early-Z? + if(CanEarlyZ(pPSState)) + { + RDTSC_START(BEEarlyDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); + RDTSC_STOP(BEEarlyDepthTest, 0, 0); + + // early-exit if no pixels passed depth or earlyZ is forced on + if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + goto Endtile; + } + } + } + + psContext.sampleIndex = 0; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + RDTSC_START(BEPixelShader); + UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + RDTSC_STOP(BEPixelShader, 0, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + // late-Z + if(!CanEarlyZ(pPSState)) + { + RDTSC_START(BELateDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); + RDTSC_STOP(BELateDepthTest, 0, 0); + + if(!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + goto Endtile; + } + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, + vCoverageMask, depthPassMask); + + // do final depth write after 
all pixel kills + if (!pPSState->forceEarlyZ) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + } + RDTSC_STOP(BEOutputMerger, 0, 0); + } + +Endtile: + RDTSC_START(BEEndTile); + coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + + RDTSC_START(BESetup); + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); + coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); + coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); + + uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; + uint32_t NumRT = state.psState.numRenderTargets; + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] = renderBuffers.pColor[rt]; + } + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + psContext.pAttribs = work.pAttribs; + psContext.pPerspAttribs = work.pPerspAttribs; + psContext.pRecipW = work.pRecipW; + psContext.frontFace = work.triFlags.frontFacing; + psContext.primID = work.triFlags.primID; + + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + psContext.I = work.I; + psContext.J = work.J; + psContext.recipDet = work.recipDet; + psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX; + psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY; + const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples; + + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + // pixel center + 
        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            // pixel center
+            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+            RDTSC_START(BEBarycentric);
+            backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
+            if(bInputCoverage)
+            {
+                generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(bCentroidPos)
+            {
+                ///@todo: no need to generate input coverage twice when both input coverage and centroid are enabled
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+            }
+
+            for(uint32_t sample = 0; sample < numSamples; sample++)
+            {
+                if (work.coverageMask[sample] & MASK)
+                {
+                    RDTSC_START(BEBarycentric);
+
+                    // calculate per sample positions
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+
+                    simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate z
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                             psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    // build the vector coverage mask after any user clip culling so clipped samples are not shaded
+                    simdscalar vCoverageMask = vMask(coverageMask);
+                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                    // Early-Z?
+                    if (CanEarlyZ(pPSState))
+                    {
+                        RDTSC_START(BEEarlyDepthTest);
+                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                        // early-exit if no samples passed depth or earlyZ is forced on.
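+                        // (with forceEarlyZ, the PS cannot modify Z, so depth/stencil
+                        // results can be committed before shading; the shader still
+                        // runs afterwards for any samples that survived)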
+ if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + } + + psContext.sampleIndex = sample; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + RDTSC_START(BEPixelShader); + UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + RDTSC_STOP(BEPixelShader, 0, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + //// late-Z + if (!CanEarlyZ(pPSState)) + { + RDTSC_START(BELateDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + RDTSC_STOP(BELateDepthTest, 0, 0); + + if (!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, + vCoverageMask, depthPassMask); + + // do final depth write after all pixel kills + if (!pPSState->forceEarlyZ) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + } + RDTSC_STOP(BEOutputMerger, 0, 0); + } + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + RDTSC_START(BEEndTile); + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for (uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + static const bool bIsStandardPattern = (bool)samplePattern; + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + static const bool bForcedSampleCount = (bool)forcedSampleCount; + + RDTSC_START(BESetup); + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs 
        = pDC->pState->backendFuncs;
+
+    // broadcast scalars
+    BarycentricCoeffs coeffs;
+    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+
+    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
+    uint32_t NumRT = state.psState.numRenderTargets;
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        pColorBase[rt] = renderBuffers.pColor[rt];
+    }
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    RDTSC_STOP(BESetup, 0, 0);
+
+    SWR_PS_CONTEXT psContext;
+    psContext.pAttribs = work.pAttribs;
+    psContext.pPerspAttribs = work.pPerspAttribs;
+    psContext.frontFace = work.triFlags.frontFacing;
+    psContext.primID = work.triFlags.primID;
+    psContext.pRecipW = work.pRecipW;
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext.I = work.I;
+    psContext.J = work.J;
+    psContext.recipDet = work.recipDet;
+    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
+    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+    psContext.sampleIndex = 0;
+
+    uint32_t numCoverageSamples;
+    if(bIsStandardPattern)
+    {
+        numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
+    }
+    else
+    {
+        numCoverageSamples = 1;
+    }
+
+    uint32_t numOMSamples;
+    // the RT must be single sample if we're in forcedMSAA mode with multisampled rasterization, so the OM runs at 1x
+    if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
+    {
+        numOMSamples = 1;
+    }
+    // if we're instead forced to single sample rasterization, run the OM at the sample count of the RT
+    else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
+    {
+        numOMSamples = GetNumSamples(pBlendState->sampleCount);
+    }
+    // else we're in normal MSAA mode and the rasterizer and OM run at the same sample count
+    else
+    {
+        numOMSamples = MultisampleTraits<sampleCount>::numSamples;
+    }
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
+            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            // set pixel center positions
+            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+            if (bInputCoverage)
+            {
+                generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(bCentroidPos)
+            {
+                ///@todo: no need to generate input coverage twice when both input coverage and centroid are enabled
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+            }
+
+            // if oDepth is written to, or there is a potential to discard any samples, we need to
+            // run the PS early, then interp or broadcast Z and test
+            if(pPSState->writesODepth || pPSState->killsPixel)
+            {
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+
+                // interpolate z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+
+                // execute pixel shader
+                RDTSC_START(BEPixelShader);
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                RDTSC_STOP(BEPixelShader, 0, 0);
+            }
+            else
+            {
+                psContext.activeMask = _simd_set1_epi32(-1);
+            }
+
+            // need to declare enough space for all samples
+            simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar anyDepthSamplePassed = _simd_setzero_ps();
+            simdscalar anyStencilSamplePassed = _simd_setzero_ps();
+            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            {
+                vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
+
+                // pull mask back out for any discards and AND it with coverage
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
+
+                if (!_simd_movemask_ps(vCoverageMask[sample]))
+                {
+                    vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                    continue;
+                }
+
+                if(bForcedSampleCount)
+                {
+                    // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                    const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+                    anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
+                    continue;
+                }
+
+                depthPassMask[sample] = vCoverageMask[sample];
+
+                // if oDepth isn't written to, we need to interpolate Z for each sample
+                // if clip distances are enabled, we need to interpolate for each sample
+                if(!pPSState->writesODepth || rastState.clipDistanceMask)
+                {
+                    RDTSC_START(BEBarycentric);
+                    if(bIsStandardPattern)
+                    {
+                        // calculate per sample positions
+                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+                    }
+                    else
+                    {
+                        psContext.vX.sample = psContext.vX.center;
+                        psContext.vY.sample = psContext.vY.center;
+                    }
+
+                    // calc I & J per sample
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate z
+                    if (!pPSState->writesODepth)
+                    {
+                        vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    ///@todo: perspective correct vs non-perspective correct clipping?
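+                    // (ComputeUserClipMask interpolates each active clip distance
+                    // at this sample's i/j and returns a mask of pixels where the
+                    // distance is < 0 or NaN; those are removed from coverage below)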
+
+                    // interpolate clip distances
+                    if (rastState.clipDistanceMask)
+                    {
+                        uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                               psContext.vI.sample, psContext.vJ.sample);
+                        vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+                    }
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+                }
+                // else 'broadcast' and test psContext.vZ written from the PS each sample
+                else
+                {
+                    vZ[sample] = psContext.vZ;
+                }
+
+                // offset depth/stencil buffers current sample
+                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                // ZTest for this sample
+                RDTSC_START(BEEarlyDepthTest);
+                stencilPassMask[sample] = vCoverageMask[sample];
+                depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+                RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+                anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
+                uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+                uint32_t statCount = _mm_popcnt_u32(statMask);
+                UPDATE_STAT(DepthPassCount, statCount);
+            }
+
+            // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
+            if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+            {
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+                // interpolate z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+
+                // execute pixel shader
+                RDTSC_START(BEPixelShader);
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                RDTSC_STOP(BEPixelShader, 0, 0);
+            }
+            ///@todo: make sure this works for kill pixel
+            else if(!_simd_movemask_ps(anyStencilSamplePassed))
+            {
+                goto Endtile;
+            }
+
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < numOMSamples; sample++)
+            {
+                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                // skip if none of the pixels for this sample passed
+                simdscalar coverageMaskSample;
+                simdscalar depthMaskSample;
+                simdscalar stencilMaskSample;
+                simdscalar vInterpolatedZ;
+
+                // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
+                // depth test is disabled, so just set the z val to 0.
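+                // (the zero is only a placeholder to give DepthStencilWrite below a
+                // defined value; nothing is depth-tested in this mode)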
+ if(bForcedSampleCount) + { + coverageMaskSample = depthMaskSample = anyDepthSamplePassed; + vInterpolatedZ = _simd_setzero_ps(); + } + else if(bIsStandardPattern) + { + if(!_simd_movemask_ps(depthPassMask[sample])) + { + depthPassMask[sample] = _simd_setzero_ps(); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample], + vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); + continue; + } + coverageMaskSample = vCoverageMask[sample]; + depthMaskSample = depthPassMask[sample]; + stencilMaskSample = stencilPassMask[sample]; + vInterpolatedZ = vZ[sample]; + } + else + { + // center pattern only needs to use a single depth test as all samples are at the same position + if(!_simd_movemask_ps(depthPassMask[0])) + { + depthPassMask[0] = _simd_setzero_ps(); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0], + vCoverageMask[0], pStencilSample, stencilPassMask[0]); + continue; + } + coverageMaskSample = (vCoverageMask[0]); + depthMaskSample = depthPassMask[0]; + stencilMaskSample = stencilPassMask[0]; + vInterpolatedZ = vZ[0]; + } + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, + coverageMaskSample, depthMaskSample); + + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample, + coverageMaskSample, pStencilSample, stencilMaskSample); + RDTSC_STOP(BEOutputMerger, 0, 0); + } + +Endtile: + RDTSC_START(BEEndTile); + for(uint32_t sample = 0; sample < numCoverageSamples; sample++) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} +// optimized backend flow with NULL PS +template<uint32_t sampleCountT> +void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + RDTSC_START(BESetup); + + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + + for (uint32_t xx = x; xx 
< x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { + // UL pixel corners + simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + + // iterate over active samples + unsigned long sample = 0; + uint32_t sampleMask = state.blendState.sampleMask; + while (_BitScanForward(&sample, sampleMask)) + { + sampleMask &= ~(1 << sample); + if (work.coverageMask[sample] & MASK) + { + RDTSC_START(BEBarycentric); + // calculate per sample positions + psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample)); + psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample)); + + backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); + + // interpolate z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + + RDTSC_STOP(BEBarycentric, 0, 0); + + simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK); + simdscalar stencilPassMask = vCoverageMask; + + // offset depth/stencil buffers to the current sample + uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample); + uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); + + RDTSC_START(BEEarlyDepthTest); + simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + RDTSC_STOP(BEEarlyDepthTest, 0, 0); + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + } + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + } + } +} + +void InitClearTilesTable() +{ + memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); + + sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; + sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; + sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; + sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; + sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; +} + +PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {}; +PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {}; +PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {}; +PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {}; +PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {}; +PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {}; +PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
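The function tables above are populated by the recursive "chooser" templates that follow. A self-contained sketch of the pattern, assuming a hypothetical two-parameter kernel (Kernel, PFN_KERNEL and Chooser are illustrative names, not part of this change):

#include <cstdint>

template <uint32_t A, uint32_t B>
void Kernel() { /* body specialized at compile time on A and B */ }

typedef void (*PFN_KERNEL)();

template <uint32_t... ArgsT>
struct Chooser
{
    // last runtime argument: terminate the recursion and pick the kernel
    static PFN_KERNEL GetFunc(uint32_t arg)
    {
        return (arg > 0) ? &Kernel<ArgsT..., 1> : &Kernel<ArgsT..., 0>;
    }

    // earlier arguments: append one static value and recurse
    template <typename... TArgsT>
    static PFN_KERNEL GetFunc(uint32_t arg, TArgsT... remainingArgs)
    {
        return (arg > 0) ? Chooser<ArgsT..., 1>::GetFunc(remainingArgs...)
                         : Chooser<ArgsT..., 0>::GetFunc(remainingArgs...);
    }
};

// usage: PFN_KERNEL pfn = Chooser<>::GetFunc(runtimeA, runtimeB);

// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t...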
ArgsT> +struct OMChooser +{ + // Last Arg Terminator + static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break; + case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break; + case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break; + case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break; + case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break; + case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break; + case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break; + case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break; + case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break; + case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break; + case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break; + case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break; + case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid RT index\n"); + return nullptr; + break; + } + } +}; + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template <uint32_t... ArgsT> +struct BECentroidBarycentricChooser +{ + + // Last Arg Terminator + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg) + { + if(tArg > 0) + { + return CalcCentroidBarycentrics<ArgsT..., 1>; + } + + return CalcCentroidBarycentrics<ArgsT..., 0>; + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + if(tArg > 0) + { + return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...); + } + + return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...); + } +}; + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template <uint32_t... 
ArgsT> +struct BEChooser +{ + // Last Arg Terminator + static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) + { + switch(tArg) + { + case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break; + case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break; + case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break; + default: + SWR_ASSERT(0 && "Invalid backend func\n"); + return nullptr; + break; + } + } + + + // Recursively parse args + template <typename... TArgsT> + static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + if(tArg > 0) + { + return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...); + } + + return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...); + } +}; + +template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates> +void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates]) +{ + for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++) + { + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + table[rtNum][sampleCount] = + OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount); + } + } +} + +template <SWR_MULTISAMPLE_COUNT numSampleRates> +void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], + PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2], + PFN_CALC_CENTROID_BARYCENTRICS (¢roidTable)[numSampleRates][2][2][2]) +{ + pixelTable[0] = CalcPixelBarycentrics<0>; + pixelTable[1] = CalcPixelBarycentrics<1>; + + sampleTable[0] = CalcSampleBarycentrics<0>; + sampleTable[1] = CalcSampleBarycentrics<1>; + + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t baryMask = 0; baryMask < 2; baryMask++) + { + for(uint32_t patternNum = 0; patternNum < 2; patternNum++) + { + for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++) + { + centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]= + BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable); + } + } + } + } +} + +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2]) +{ + gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, 
SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); +} + +template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes> +void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2]) +{ + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++) + { + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + { + for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) + { + table[sampleCount][samplePattern][inputCoverage][isCentroid][0] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE); + table[sampleCount][samplePattern][inputCoverage][isCentroid][1] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE); + } + } + } + } +} + +template <uint32_t numSampleRates, uint32_t numCoverageModes> +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2]) +{ + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + { + table[sampleCount][inputCoverage][0] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + table[sampleCount][inputCoverage][1] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + } + } +} + +void InitBackendFuncTables() +{ + InitBackendSampleFuncTable(gBackendSingleSample); + InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable); + InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable); + InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable); + InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable); + + gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>; + gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>; + gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>; + gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>; + gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>; +} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h new file mode 100644 index 00000000000..53089e5047b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -0,0 +1,59 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.h +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ +#pragma once + +#include "common/os.h" +#include "core/context.h" + +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); +void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); +void InitClearTilesTable(); + +enum SWR_BACKEND_FUNCS +{ + SWR_BACKEND_SINGLE_SAMPLE, + SWR_BACKEND_MSAA_PIXEL_RATE, + SWR_BACKEND_MSAA_SAMPLE_RATE, + SWR_BACKEND_FUNCS_MAX, +}; +void InitBackendFuncTables(); + +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h new file mode 100644 index 00000000000..626c237d75b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/blend.h @@ -0,0 +1,318 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file blend.h +* +* @brief Implementation for blending operations. +* +******************************************************************************/ +#include "state.h" + +template<bool Color, bool Alpha> +INLINE +void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out) +{ + simdvector result; + + switch (func) + { + case BLENDFACTOR_ZERO: + result.x = _simd_setzero_ps(); + result.y = _simd_setzero_ps(); + result.z = _simd_setzero_ps(); + result.w = _simd_setzero_ps(); + break; + + case BLENDFACTOR_ONE: + result.x = _simd_set1_ps(1.0); + result.y = _simd_set1_ps(1.0); + result.z = _simd_set1_ps(1.0); + result.w = _simd_set1_ps(1.0); + break; + + case BLENDFACTOR_SRC_COLOR: + result = src; + break; + + case BLENDFACTOR_DST_COLOR: + result = dst; + break; + + case BLENDFACTOR_INV_SRC_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); + break; + + case BLENDFACTOR_INV_DST_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); + break; + + case BLENDFACTOR_SRC_ALPHA: + result.x = src.w; + result.y = src.w; + result.z = src.w; + result.w = src.w; + break; + + case BLENDFACTOR_INV_SRC_ALPHA: + { + simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); + result.x = oneMinusSrcA; + result.y = oneMinusSrcA; + result.z = oneMinusSrcA; + result.w = oneMinusSrcA; + break; + } + + case BLENDFACTOR_DST_ALPHA: + result.x = dst.w; + result.y = dst.w; + result.z = dst.w; + result.w = dst.w; + break; + + case BLENDFACTOR_INV_DST_ALPHA: + { + simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); + result.x = oneMinusDstA; + result.y = oneMinusDstA; + result.z = oneMinusDstA; + result.w = oneMinusDstA; + break; + } + + case BLENDFACTOR_SRC_ALPHA_SATURATE: + { + simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); + result.x = sat; + result.y = sat; + result.z = sat; + result.w = _simd_set1_ps(1.0); + break; + } + + case BLENDFACTOR_CONST_COLOR: + result.x = constantColor[0]; + result.y = constantColor[1]; + result.z = constantColor[2]; + result.w
= constantColor[3]; + break; + + case BLENDFACTOR_CONST_ALPHA: + result.x = result.y = result.z = result.w = constantColor[3]; + break; + + case BLENDFACTOR_INV_CONST_COLOR: + { + result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]); + result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]); + result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]); + result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); + break; + } + + case BLENDFACTOR_INV_CONST_ALPHA: + { + result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); + break; + } + + case BLENDFACTOR_SRC1_COLOR: + result.x = src1.x; + result.y = src1.y; + result.z = src1.z; + result.w = src1.w; + break; + + case BLENDFACTOR_SRC1_ALPHA: + result.x = result.y = result.z = result.w = src1.w; + break; + + case BLENDFACTOR_INV_SRC1_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); + break; + + case BLENDFACTOR_INV_SRC1_ALPHA: + result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); + break; + + default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func); + } + + if (Color) + { + out.x = result.x; + out.y = result.y; + out.z = result.z; + } + if (Alpha) + { + out.w = result.w; + } + +} + +template<bool Color, bool Alpha> +INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out) +{ + simdvector result; + + switch (blendOp) + { + case BLENDOP_ADD: + result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_SUBTRACT: + result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_REVSUBTRACT: + result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x)); + result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y)); + result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); + result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); + break; + + case BLENDOP_MIN: + result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_MAX: + result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), 
_simd_mul_ps(dstFactor.w, dst.w)); + break; + + default: + SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp); + } + + if (Color) + { + out.x = result.x; + out.y = result.y; + out.z = result.z; + } + if (Alpha) + { + out.w = result.w; + } +} + +template<SWR_TYPE type> +INLINE void Clamp(simdvector &src) +{ + switch (type) + { + case SWR_TYPE_FLOAT: + break; + + case SWR_TYPE_UNORM: + src.x = _simd_max_ps(src.x, _simd_setzero_ps()); + src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); + + src.y = _simd_max_ps(src.y, _simd_setzero_ps()); + src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); + + src.z = _simd_max_ps(src.z, _simd_setzero_ps()); + src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); + + src.w = _simd_max_ps(src.w, _simd_setzero_ps()); + src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); + break; + + case SWR_TYPE_SNORM: + src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f)); + src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); + + src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f)); + src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); + + src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f)); + src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); + + src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f)); + src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); + break; + + default: + SWR_ASSERT(false, "Unimplemented clamp: %d", type); + break; + } +} + +template<SWR_TYPE type> +void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result) +{ + // load render target + simdvector dst; + LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst); + + simdvector constColor; + constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]); + constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]); + constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]); + constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]); + + // clamp src/dst/constant + Clamp<type>(src); + Clamp<type>(src1); + Clamp<type>(dst); + Clamp<type>(constColor); + + simdvector srcFactor, dstFactor; + if (pBlendState->independentAlphaBlendEnable) + { + GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor); + + GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); + + BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + } + else + { + GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + + BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp new file mode 100644 index 00000000000..ce27bf71d3c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -0,0 +1,201 @@ +/**************************************************************************** +* 
Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file clip.cpp +* +* @brief Implementation for clipping +* +******************************************************************************/ + +#include <assert.h> + +#include "common/os.h" +#include "core/clip.h" + +float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) +{ + return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); +} + +template<SWR_CLIPCODES ClippingPlane> +inline void intersect( + int s, // index to the first edge vertex v1 in pInPts. + int p, // index to the second edge vertex v2 in pInPts. + const float *pInPts, // array of all the input positions. + const float *pInAttribs, // array of all attributes for all vertices. The attributes for each vertex are contiguous. + int numInAttribs, // number of attributes per vertex. + int i, // output index. + float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. + float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. +{ + float t; + + // Find the parameter of the intersection. + // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. + const float *v1 = &pInPts[s*4]; + const float *v2 = &pInPts[p*4]; + + switch (ClippingPlane) + { + case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; + case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; + case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; + case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; + case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; + case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; + default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + }; + + + const float *a1 = &pInAttribs[s*numInAttribs]; + const float *a2 = &pInAttribs[p*numInAttribs]; + + float *pOutP = &pOutPts[i*4]; + float *pOutA = &pOutAttribs[i*numInAttribs];
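ComputeInterpFactor follows from the boundary distance being linear along the edge: solving d1 + t*(d2 - d1) = 0 for t gives t = d1 / (d1 - d2). A worked check, where ComputeInterpFactorRef simply mirrors the function above:

#include <cassert>

static float ComputeInterpFactorRef(float d1, float d2)
{
    return d1 / (d1 - d2);
}

int main()
{
    // boundary distance +1 at v1 and -3 at v2: the linear distance
    // crosses zero a quarter of the way from v1 to v2
    assert(ComputeInterpFactorRef(1.0f, -3.0f) == 0.25f);
    return 0;
}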
+ // Interpolate new position. + for(int j = 0; j < 4; ++j) + { + pOutP[j] = v1[j] + (v2[j]-v1[j])*t; + } + + // Interpolate Attributes + for(int attr = 0; attr < numInAttribs; ++attr) + { + pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; + } +} + + +// Checks whether vertex v lies inside the clipping plane +// in homogeneous coords: check -w <= {x,y} <= w and 0 <= z <= w (near uses the D3D convention below) +// +template<SWR_CLIPCODES ClippingPlane> +inline int inside(const float v[4]) +{ + switch (ClippingPlane) + { + case FRUSTUM_LEFT : return (v[0]>=-v[3]); + case FRUSTUM_RIGHT : return (v[0]<= v[3]); + case FRUSTUM_TOP : return (v[1]>=-v[3]); + case FRUSTUM_BOTTOM : return (v[1]<= v[3]); + case FRUSTUM_NEAR : return (v[2]>=0.0f); + case FRUSTUM_FAR : return (v[2]<= v[3]); + default: + SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + return 0; + } +} + + +// Clips a polygon in homogeneous coordinates to a particular clipping plane. +// Takes in vertices of the polygon (InPts) and the clipping plane +// Puts the vertices of the clipped polygon in OutPts +// Returns number of points in clipped polygon +// +template<SWR_CLIPCODES ClippingPlane> +int ClipTriToPlane( const float *pInPts, int numInPts, + const float *pInAttribs, int numInAttribs, + float *pOutPts, float *pOutAttribs) +{ + int i=0; // number of vertices written to pOutPts so far; each vertex occupies 4 floats + + for (int j = 0; j < numInPts; ++j) + { + int s = j; + int p = (j + 1) % numInPts; + + int s_in = inside<ClippingPlane>(&pInPts[s*4]); + int p_in = inside<ClippingPlane>(&pInPts[p*4]); + + // test if vertex is to be added to output vertices + if (s_in != p_in) // edge crosses clipping plane + { + // find point of intersection + intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); + i++; + } + if (p_in) // 2nd vertex is inside clipping volume, add it to output + { + // Copy 2nd vertex position of edge over to output. + for(int k = 0; k < 4; ++k) + { + pOutPts[i*4 + k] = pInPts[p*4 + k]; + }
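The loop above is the per-plane Sutherland-Hodgman step: each directed edge (s, p) emits up to two vertices. A compact restatement of the emission rule (EmitCount is an illustrative helper, not code from this commit):

// For one edge (s, p) tested against one plane:
//   edge crosses the plane -> emit the intersection point
//   p is inside            -> also emit p
// so each edge contributes 0, 1 or 2 output vertices.
static int EmitCount(bool sInside, bool pInside)
{
    int n = 0;
    if (sInside != pInside) ++n;   // crossing: intersection point
    if (pInside)            ++n;   // surviving endpoint
    return n;
}

// Copy 2nd vertex attributes of edge over to output.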
+ for(int attr = 0; attr < numInAttribs; ++attr) + { + pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr]; + } + i++; + } + // edge does not cross clipping plane and vertex outside clipping volume + // => do not add vertex + } + return i; +} + + + +void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) +{ + // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping + OSALIGN(float, 16) tempPts[6 * 4]; + OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; + + // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision + int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + + SWR_ASSERT(NumOutPts <= 6); + + *numVerts = NumOutPts; + return; +} + +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipTriangles); + Clipper<3> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipTriangles, 1, 0); +} + +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipLines); + Clipper<2> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipLines, 1, 0); +} +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipPoints); + Clipper<1> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipPoints, 1, 0); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h new file mode 100644 index 00000000000..49494a4e374 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -0,0 +1,868 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file clip.h +* +* @brief Definitions for clipping +* +******************************************************************************/ +#pragma once + +#include "common/simdintrin.h" +#include "core/context.h" +#include "core/pa.h" +#include "rdtsc_core.h" + +enum SWR_CLIPCODES +{ + // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. + // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. +#define CLIPCODE_SHIFT 23 + FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), + FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), + FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), + FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), + + FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), + FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), + + NEGW = (0x40 << CLIPCODE_SHIFT), + + GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), + GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), + GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), + GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) +}; + +#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) +#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) + +void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, + int *numVerts, float *pOutAttribs); + +INLINE +void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes) +{ + clipCodes = _simd_setzero_ps(); + + // -w + simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); + + // FRUSTUM_LEFT + simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); + clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); + + // FRUSTUM_TOP + vRes = _simd_cmplt_ps(vertex.y, vNegW); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); + + // FRUSTUM_RIGHT + vRes = _simd_cmpgt_ps(vertex.x, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); + + // FRUSTUM_BOTTOM + vRes = _simd_cmpgt_ps(vertex.y, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); + + if (state.rastState.depthClipEnable) + { + // FRUSTUM_NEAR + // DX clips depth [0..w], GL clips [-w..w] + if (type == DX) + { + vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); + } + else + { + vRes = _simd_cmplt_ps(vertex.z, vNegW); + } + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); + + // FRUSTUM_FAR + vRes = _simd_cmpgt_ps(vertex.z, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); + } + + // NEGW + vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); + + // GUARDBAND_LEFT + 
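The SIMD plane tests in ComputeClipCodes mirror a simple scalar rule. A sketch of the frustum portion, reusing the SWR_CLIPCODES values defined above (ClipCodeScalar and isDX are illustrative names, and the real code also honors depthClipEnable), before the guardband terms are ORed in below:

#include <cstdint>

static uint32_t ClipCodeScalar(float x, float y, float z, float w, bool isDX)
{
    uint32_t code = 0;
    if (x < -w)    code |= FRUSTUM_LEFT;
    if (y < -w)    code |= FRUSTUM_TOP;
    if (x > w)     code |= FRUSTUM_RIGHT;
    if (y > w)     code |= FRUSTUM_BOTTOM;
    if (isDX ? (z < 0.0f) : (z < -w))
                   code |= FRUSTUM_NEAR;   // DX clips z to [0, w], GL to [-w, w]
    if (z > w)     code |= FRUSTUM_FAR;
    if (w <= 0.0f) code |= NEGW;
    return code;
}

// A primitive is trivially rejected when the AND (intersection) of its
// vertex codes is non-zero; it needs real clipping when the OR (union)
// overlaps GUARDBAND_CLIP_MASK.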
simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left)); + vRes = _simd_cmplt_ps(vertex.x, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); + + // GUARDBAND_TOP + gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top)); + vRes = _simd_cmplt_ps(vertex.y, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); + + // GUARDBAND_RIGHT + gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right)); + vRes = _simd_cmpgt_ps(vertex.x, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); + + // GUARDBAND_BOTTOM + gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom)); + vRes = _simd_cmpgt_ps(vertex.y, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); +} + +template<uint32_t NumVertsPerPrim> +class Clipper +{ +public: + Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : + workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC)) + { + static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); + } + + void ComputeClipCodes(simdvector vertex[]) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]); + } + } + + simdscalar ComputeClipCodeIntersection() + { + simdscalar result = this->clipCodes[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd_and_ps(result, this->clipCodes[i]); + } + return result; + } + + simdscalar ComputeClipCodeUnion() + { + simdscalar result = this->clipCodes[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd_or_ps(result, this->clipCodes[i]); + } + return result; + } + + int ComputeNegWMask() + { + simdscalar clipCodeUnion = ComputeClipCodeUnion(); + clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW))); + return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps())); + } + + int ComputeClipMask() + { + simdscalar clipUnion = ComputeClipCodeUnion(); + clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK))); + return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); + } + + // clipper is responsible for culling any prims with NAN coordinates + int ComputeNaNMask(simdvector prim[]) + { + simdscalar vNanMask = _simd_setzero_ps(); + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); + vNanMask = _simd_or_ps(vNanMask, vNan01); + simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); + vNanMask = _simd_or_ps(vNanMask, vNan23); + } + + return _simd_movemask_ps(vNanMask); + } + + int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) + { + uint8_t cullMask = this->state.rastState.cullDistanceMask; + simdscalar vClipCullMask = _simd_setzero_ps(); + DWORD index; + + simdvector vClipCullDistLo[3]; + simdvector vClipCullDistHi[3]; + + pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); + pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); + while (_BitScanForward(&index, cullMask)) + { + cullMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + simdscalar vCullMaskElem = _simd_set1_ps(-1.0f); + for (uint32_t e = 0; e < 
NumVertsPerPrim; ++e) + { + simdscalar vCullComp; + if (slot == 0) + { + vCullComp = vClipCullDistLo[e][component]; + } + else + { + vCullComp = vClipCullDistHi[e][component]; + } + + // cull if cull distance < 0 || NAN + simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ); + vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull); + } + vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem); + } + + // clipper should also discard any primitive with NAN clip distance + uint8_t clipMask = this->state.rastState.clipDistanceMask; + while (_BitScanForward(&index, clipMask)) + { + clipMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simdscalar vClipComp; + if (slot == 0) + { + vClipComp = vClipCullDistLo[e][component]; + } + else + { + vClipComp = vClipCullDistHi[e][component]; + } + + simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); + vClipCullMask = _simd_or_ps(vClipCullMask, vClip); + } + } + + return _simd_movemask_ps(vClipCullMask); + } + + // clip a single primitive + int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) + { + OSALIGN(float, 16) inVerts[3 * 4]; + OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; + + // transpose primitive position + __m128 verts[3]; + pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts); + _mm_store_ps(&inVerts[0], verts[0]); + _mm_store_ps(&inVerts[4], verts[1]); + _mm_store_ps(&inVerts[8], verts[2]); + + // transpose attribs + uint32_t numScalarAttribs = this->state.linkageCount * 4; + + int idx = 0; + DWORD slot = 0; + uint32_t mapIdx = 0; + uint32_t tmpLinkage = uint32_t(this->state.linkageMask); + while (_BitScanForward(&slot, tmpLinkage)) + { + tmpLinkage &= ~(1 << slot); + // Compute absolute attrib slot in vertex array + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; + __m128 attrib[3]; // triangle attribs (always 4 wide) + pa.AssembleSingle(inputSlot, primIndex, attrib); + _mm_store_ps(&inAttribs[idx], attrib[0]); + _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]); + _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]); + idx += 4; + } + + int numVerts; + Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs); + + return numVerts; + } + + // clip SIMD primitives + void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) + { + // input/output vertex store for clipper + simdvertex vertices[7]; // maximum 7 verts generated per triangle + + LONG constantInterpMask = this->state.backendState.constantInterpolationMask; + uint32_t provokingVertex = 0; + if(pa.binTopology == TOP_TRIANGLE_FAN) + { + provokingVertex = this->state.frontendState.provokingVertex.triFan; + } + ///@todo: line topology for wireframe? 
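The constant-interpolation fix-up just below broadcasts the provoking vertex's attribute to every vertex of the primitive, so any later barycentric interpolation of a flat attribute collapses to a constant. A scalar sketch (Attrib and BroadcastProvoking are illustrative names, not part of this change):

#include <array>

using Attrib = std::array<float, 4>;

static void BroadcastProvoking(Attrib verts[3], unsigned provokingVertex)
{
    // afterwards every convex combination of the three vertices equals
    // the provoking vertex's value
    for (unsigned i = 0; i < 3; ++i)
        verts[i] = verts[provokingVertex];
}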
+ + // assemble pos + simdvector tmpVector[NumVertsPerPrim]; + pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; + } + + // assemble attribs + DWORD slot = 0; + uint32_t mapIdx = 0; + uint32_t tmpLinkage = this->state.linkageMask; + + int32_t maxSlot = -1; + while (_BitScanForward(&slot, tmpLinkage)) + { + tmpLinkage &= ~(1 << slot); + // Compute absolute attrib slot in vertex array + uint32_t mapSlot = this->state.linkageMap[mapIdx++]; + maxSlot = std::max<int32_t>(maxSlot, mapSlot); + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot; + + pa.Assemble(inputSlot, tmpVector); + + // if constant interpolation enabled for this attribute, assign the provoking + // vertex values to all edges + if (_bittest(&constantInterpMask, slot)) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; + } + } + else + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[i]; + } + } + } + + uint32_t numAttribs = maxSlot + 1; + + simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); + + // set up new PA for binning clipped primitives + PFN_PROCESS_PRIMS pfnBinFunc = nullptr; + PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; + if (NumVertsPerPrim == 3) + { + pfnBinFunc = BinTriangles; + clipTopology = TOP_TRIANGLE_FAN; + + // so that the binner knows to bloat wide points later + if (pa.binTopology == TOP_POINT_LIST) + clipTopology = TOP_POINT_LIST; + } + else if (NumVertsPerPrim == 2) + { + pfnBinFunc = BinLines; + clipTopology = TOP_LINE_LIST; + } + else + { + SWR_ASSERT(0 && "Unexpected points in clipper."); + } + + + uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; + uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; + + const simdscalari vOffsets = _mm256_set_epi32( + 0 * sizeof(simdvertex), // unused lane + 6 * sizeof(simdvertex), + 5 * sizeof(simdvertex), + 4 * sizeof(simdvertex), + 3 * sizeof(simdvertex), + 2 * sizeof(simdvertex), + 1 * sizeof(simdvertex), + 0 * sizeof(simdvertex)); + + // only need to gather 7 verts + // @todo dynamic mask based on actual # of verts generated per lane + const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); + + uint32_t numClippedPrims = 0; + for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) + { + uint32_t numEmittedVerts = pVertexCount[inputPrim]; + if (numEmittedVerts < NumVertsPerPrim) + { + continue; + } + SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); + + uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); + numClippedPrims += numEmittedPrims; + + // transpose clipper output so that each lane's vertices are in SIMD order + // set aside space for 2 vertices, as the PA will try to read up to 16 verts + // for triangle fan + simdvertex transposedPrims[2]; + + // transpose pos + uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + + // transpose attribs + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT +
attrib; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); + + while (clipPa.GetNextStreamOutput()) + { + do + { + simdvector attrib[NumVertsPerPrim]; + bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); + if (assemble) + { + static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); + } + } while (clipPa.NextPrim()); + } + } + + // update global pipeline stat + SWR_CONTEXT* pContext = this->pDC->pContext; + UPDATE_STAT(CPrimitives, numClippedPrims); + } + + // execute the clipper stage + void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) + { + // set up binner based on PA state + PFN_PROCESS_PRIMS pfnBinner; + switch (pa.binTopology) + { + case TOP_POINT_LIST: + pfnBinner = BinPoints; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pfnBinner = BinLines; + break; + default: + pfnBinner = BinTriangles; + break; + }; + + // update clipper invocations pipeline stat + SWR_CONTEXT* pContext = this->pDC->pContext; + uint32_t numInvoc = _mm_popcnt_u32(primMask); + UPDATE_STAT(CInvocations, numInvoc); + + ComputeClipCodes(prim); + + // cull prims with NAN coords + primMask &= ~ComputeNaNMask(prim); + + // user cull distance cull + if (this->state.rastState.cullDistanceMask) + { + primMask &= ~ComputeUserClipCullMask(pa, prim); + } + + // cull prims outside view frustum + simdscalar clipIntersection = ComputeClipCodeIntersection(); + int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); + + // skip clipping for points + uint32_t clipMask = 0; + if (NumVertsPerPrim != 1) + { + clipMask = primMask & ComputeClipMask(); + } + + if (clipMask) + { + RDTSC_START(FEGuardbandClip); + // we have to clip tris, execute the clipper, which will also + // call the binner + ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); + RDTSC_STOP(FEGuardbandClip, 1, 0); + } + else if (validMask) + { + // update CPrimitives pipeline state + SWR_CONTEXT* pContext = this->pDC->pContext; + UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); + + // forward valid prims directly to binner + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); + } + } + +private: + inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) + { + return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); + } + + inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) + { + const uint32_t simdVertexStride = sizeof(simdvertex); + const uint32_t componentStride = sizeof(simdscalar); + const uint32_t attribStride = sizeof(simdvector); + const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), + 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); + + // step to the simdvertex + simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); + + // step to the attribute and component + vOffsets = _simd_add_epi32(vOffsets, 
_simd_set1_epi32(attribStride * attrib + componentStride * component)); + + // step to the lane + vOffsets = _simd_add_epi32(vOffsets, vElemOffset); + + return vOffsets; + } + + // gathers a single component for a given attribute for each SIMD lane + inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) + { + simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + simdscalar vSrc = _mm256_undefined_ps(); + return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); + } + + inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) + { + simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + + uint32_t* pOffsets = (uint32_t*)&vOffsets; + float* pSrc = (float*)&vSrc; + uint32_t mask = _simd_movemask_ps(vMask); + DWORD lane; + while (_BitScanForward(&lane, mask)) + { + mask &= ~(1 << lane); + uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; + *(float*)pBuf = pSrc[lane]; + } + } + + template<SWR_CLIPCODES ClippingPlane> + inline void intersect( + const simdscalar& vActiveMask, // active lanes to operate on + const simdscalari& s, // index to first edge vertex v0 in pInPts. + const simdscalari& p, // index to second edge vertex v1 in pInPts. + const simdvector& v1, // vertex 0 position + const simdvector& v2, // vertex 1 position + simdscalari& outIndex, // output index. + const float *pInVerts, // array of all the input positions. + uint32_t numInAttribs, // number of attributes per vertex. + float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. + { + // compute interpolation factor + simdscalar t; + switch (ClippingPlane) + { + case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break; + case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break; + case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break; + case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break; + case FRUSTUM_NEAR: + // DX Znear plane is 0, GL is -w + if (this->driverType == DX) + { + t = ComputeInterpFactor(v1[2], v2[2]); + } + else + { + t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2])); + } + break; + case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break; + default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + }; + + // interpolate position and store + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]); + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + } + + template<SWR_CLIPCODES ClippingPlane> + inline simdscalar inside(const simdvector& v) + { + switch 
(ClippingPlane) + { + case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]); + case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]); + case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]); + default: + SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + return _simd_setzero_ps(); + } + } + + template<SWR_CLIPCODES ClippingPlane> + simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simdscalari vCurIndex = _simd_setzero_si(); + simdscalari vOutIndex = _simd_setzero_si(); + simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + + while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty + { + simdscalari s = vCurIndex; + simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); + simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p); + p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask))); + + // gather position + simdvector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simdscalar s_in = inside<ClippingPlane>(vInPos0); + simdscalar p_in = inside<ClippingPlane>(vInPos1); + + // compute intersection mask (s_in != p_in) + simdscalar intersectMask = _simd_xor_ps(s_in, p_in); + intersectMask = _simd_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd_and_ps(s_in, vActiveMask); + if (!_simd_testz_ps(s_in, s_in)) + { + // store position + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // store attribs + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd_testz_ps(intersectMask, intersectMask)) + { + intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); + } + + // increment loop index and update active mask + vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1)); + vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + } + + return vOutIndex; + } + + template<SWR_CLIPCODES ClippingPlane> + simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simdscalari vCurIndex = _simd_setzero_si(); + simdscalari vOutIndex = _simd_setzero_si(); + simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + + if (!_simd_testz_ps(vActiveMask, 
vActiveMask)) + { + simdscalari s = vCurIndex; + simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); + + // gather position + simdvector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simdscalar s_in = inside<ClippingPlane>(vInPos0); + simdscalar p_in = inside<ClippingPlane>(vInPos1); + + // compute intersection mask (s_in != p_in) + simdscalar intersectMask = _simd_xor_ps(s_in, p_in); + intersectMask = _simd_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd_and_ps(s_in, vActiveMask); + if (!_simd_testz_ps(s_in, s_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd_testz_ps(intersectMask, intersectMask)) + { + intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); + } + + // store p if inside + p_in = _simd_and_ps(p_in, vActiveMask); + if (!_simd_testz_ps(p_in, p_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); + ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in); + } + } + + return vOutIndex; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Vertical clipper. Clips SIMD primitives at a time + /// @param pVertices - pointer to vertices in SOA form. 
Clipper will read input and write results to this buffer + /// @param vPrimMask - mask of valid input primitives, including non-clipped prims + /// @param numAttribs - number of valid input attribs, including position + simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) + { + // temp storage + simdvertex tempVertices[7]; + float* pTempVerts = (float*)&tempVertices[0]; + + // zero out num input verts for non-active lanes + simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); + vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask); + + // clip prims to frustum + simdscalari vNumOutPts; + if (NumVertsPerPrim == 3) + { + vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + else + { + SWR_ASSERT(NumVertsPerPrim == 2); + vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + + // restore num verts for non-clipped, active lanes + simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask); + vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask); + + return vNumOutPts; + } + + const uint32_t workerId; + const DRIVER_TYPE driverType; + DRAW_CONTEXT* pDC; + const API_STATE& state; + simdscalar clipCodes[NumVertsPerPrim]; +}; + + +// pipeline stage functions +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h new file mode 100644 index 00000000000..4a214aff1c8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -0,0 +1,495 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file context.h +* +* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT +* The SWR_CONTEXT is our global context and contains the DC ring, +* thread state, etc. +* +* The DRAW_CONTEXT contains all state associated with a draw operation. +* +******************************************************************************/ +#pragma once + +#include <condition_variable> +#include <algorithm> + +#include "core/api.h" +#include "core/utils.h" +#include "core/arena.h" +#include "core/fifo.hpp" +#include "core/knobs.h" +#include "common/simdintrin.h" +#include "core/threads.h" + +// x.8 fixed point precision values +#define FIXED_POINT_SHIFT 8 +#define FIXED_POINT_SCALE 256 + +// x.16 fixed point precision values +#define FIXED_POINT16_SHIFT 16 +#define FIXED_POINT16_SCALE 65536 + +struct SWR_CONTEXT; +struct DRAW_CONTEXT; + +struct TRI_FLAGS +{ + uint32_t frontFacing : 1; + uint32_t yMajor : 1; + uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); + uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); + float pointSize; + uint32_t primID; + uint32_t renderTargetArrayIndex; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TRIANGLE_DESC +///////////////////////////////////////////////////////////////////////// +struct SWR_TRIANGLE_DESC +{ + float I[3]; + float J[3]; + float Z[3]; + float OneOverW[3]; + float recipDet; + + float *pRecipW; + float *pAttribs; + float *pPerspAttribs; + float *pSamplePos; + float *pUserClipBuffer; + + uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + + TRI_FLAGS triFlags; +}; + +struct TRIANGLE_WORK_DESC +{ + float *pTriBuffer; + float *pAttribs; + float *pUserClipBuffer; + uint32_t numAttribs; + TRI_FLAGS triFlags; +}; + +union CLEAR_FLAGS +{ + struct + { + uint32_t mask : 3; + }; + uint32_t bits; +}; + +struct CLEAR_DESC +{ + CLEAR_FLAGS flags; + float clearRTColor[4]; // RGBA_32F + float clearDepth; // [0..1] + BYTE clearStencil; +}; + +struct INVALIDATE_TILES_DESC +{ + uint32_t attachmentMask; +}; + +struct SYNC_DESC +{ + PFN_CALLBACK_FUNC pfnCallbackFunc; + uint64_t userData; + uint64_t userData2; + uint64_t userData3; +}; + +struct QUERY_DESC +{ + SWR_STATS* pStats; +}; + +struct STORE_TILES_DESC +{ + SWR_RENDERTARGET_ATTACHMENT attachment; + SWR_TILE_STATE postStoreTileState; +}; + +struct COMPUTE_DESC +{ + uint32_t threadGroupCountX; + uint32_t threadGroupCountY; + uint32_t threadGroupCountZ; +}; + 
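The FE_WORK and BE_WORK structures defined just below pair a dispatch function pointer with a tagged union of work descriptors, so heterogeneous work items (sync, draw/triangle, clear, tile invalidate/store, query) can flow through a single queue and be executed uniformly by worker threads. A minimal standalone sketch of that pattern, where WorkItem, ClearDesc, SyncDesc, and DoClear are illustrative stand-ins rather than types from this header:

#include <cstdint>

struct ClearDesc { float rgba[4]; };
struct SyncDesc  { uint64_t userData; };

struct WorkItem;
typedef void (*PfnWork)(const WorkItem& item);

struct WorkItem
{
    PfnWork pfnWork;   // producer points this at the handler for the active union member
    union
    {
        ClearDesc clear;
        SyncDesc  sync;
    } desc;
};

static void DoClear(const WorkItem& item)
{
    // a real handler would consume item.desc.clear here
    (void)item.desc.clear.rgba[0];
}

// producer: WorkItem w; w.pfnWork = DoClear; w.desc.clear = ...; then enqueue w
// consumer: w.pfnWork(w);  // dispatch requires no switch over a type tag

Keeping the function pointer next to its descriptor lets the worker loop dispatch without branching on the work kind; the explicit WORK_TYPE tag in BE_WORK and FE_WORK stays available for consumers that do need to inspect it.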
+typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); + +enum WORK_TYPE +{ + SYNC, + DRAW, + CLEAR, + INVALIDATETILES, + STORETILES, + QUERYSTATS, +}; + +struct BE_WORK +{ + WORK_TYPE type; + PFN_WORK_FUNC pfnWork; + union + { + SYNC_DESC sync; + TRIANGLE_WORK_DESC tri; + CLEAR_DESC clear; + INVALIDATE_TILES_DESC invalidateTiles; + STORE_TILES_DESC storeTiles; + QUERY_DESC queryStats; + } desc; +}; + +struct DRAW_WORK +{ + DRAW_CONTEXT* pDC; + union + { + uint32_t numIndices; // DrawIndexed: Number of indices for draw. + uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) + }; + union + { + const int32_t* pIB; // DrawIndexed: App supplied indices + uint32_t startVertex; // Draw: Starting vertex in VB to render from. + }; + int32_t baseVertex; + uint32_t numInstances; // Number of instances + uint32_t startInstance; // Instance offset + uint32_t startPrimID; // starting primitiveID for this draw batch + uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) + SWR_FORMAT type; // index buffer type +}; + +typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); +struct FE_WORK +{ + WORK_TYPE type; + PFN_FE_WORK_FUNC pfnWork; + union + { + SYNC_DESC sync; + DRAW_WORK draw; + CLEAR_DESC clear; + INVALIDATE_TILES_DESC invalidateTiles; + STORE_TILES_DESC storeTiles; + QUERY_DESC queryStats; + } desc; +}; + +struct GUARDBAND +{ + float left, right, top, bottom; +}; + +struct PA_STATE; + +// function signature for pipeline stages that execute after primitive assembly +typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], + uint32_t primMask, simdscalari primID); + +OSALIGNLINE(struct) API_STATE +{ + // Vertex Buffers + SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; + + // Index Buffer + SWR_INDEX_BUFFER_STATE indexBuffer; + + // FS - Fetch Shader State + PFN_FETCH_FUNC pfnFetchFunc; + + // VS - Vertex Shader State + PFN_VERTEX_FUNC pfnVertexFunc; + + // GS - Geometry Shader State + PFN_GS_FUNC pfnGsFunc; + SWR_GS_STATE gsState; + + // CS - Compute Shader + PFN_CS_FUNC pfnCsFunc; + uint32_t totalThreadsInGroup; + + // FE - Frontend State + SWR_FRONTEND_STATE frontendState; + + // SOS - Streamout Shader State + PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; + + // Streamout state + SWR_STREAMOUT_STATE soState; + mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; + + // Tessellation State + PFN_HS_FUNC pfnHsFunc; + PFN_DS_FUNC pfnDsFunc; + SWR_TS_STATE tsState; + + // Specifies which VS outputs are sent to PS. 
+    // Does not include position
+    uint32_t linkageMask;
+    uint32_t linkageCount;
+    uint8_t linkageMap[MAX_ATTRIBUTES];
+
+    // attrib mask, specifies the total set of attributes used
+    // by the frontend (vs, so, gs)
+    uint32_t feAttribMask;
+
+    PRIMITIVE_TOPOLOGY topology;
+    bool forceFront;
+
+    // RS - Rasterizer State
+    SWR_RASTSTATE rastState;
+    // floating point multisample offsets
+    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
+
+    GUARDBAND gbState;
+
+    SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
+    SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS];
+
+    BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
+    BBOX scissorInFixedPoint;
+
+    // Backend state
+    SWR_BACKEND_STATE backendState;
+
+    // PS - Pixel shader state
+    SWR_PS_STATE psState;
+
+    SWR_DEPTH_STENCIL_STATE depthStencilState;
+
+    // OM - Output Merger State
+    SWR_BLEND_STATE blendState;
+    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
+
+    // Stats are incremented when this is true.
+    bool enableStats;
+
+    struct
+    {
+        uint32_t colorHottileEnable : 8;
+        uint32_t depthHottileEnable : 1;
+        uint32_t stencilHottileEnable : 1;
+    };
+};
+
+class MacroTileMgr;
+class DispatchQueue;
+
+struct RenderOutputBuffers
+{
+    uint8_t* pColor[SWR_NUM_RENDERTARGETS];
+    uint8_t* pDepth;
+    uint8_t* pStencil;
+};
+
+// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
+struct BarycentricCoeffs
+{
+    simdscalar vIa;
+    simdscalar vIb;
+    simdscalar vIc;
+
+    simdscalar vJa;
+    simdscalar vJb;
+    simdscalar vJc;
+
+    simdscalar vZa;
+    simdscalar vZb;
+    simdscalar vZc;
+
+    simdscalar vRecipDet;
+
+    simdscalar vAOneOverW;
+    simdscalar vBOneOverW;
+    simdscalar vCOneOverW;
+};
+
+// pipeline function pointer types
+typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
+typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
+                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
+typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
+typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
+typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
+                                              const simdscalar, const simdscalar);
+
+struct BACKEND_FUNCS
+{
+    PFN_BACKEND_FUNC pfnBackend;
+    PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
+    PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
+    PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
+    PFN_OUTPUT_MERGER pfnOutputMerger;
+};
+
+// Draw State
+struct DRAW_STATE
+{
+    API_STATE state;
+
+    void* pPrivateState;  // It's required that the driver set this up for each draw.
+
+    // pipeline function pointers, filled in by API thread when setting up the draw
+    BACKEND_FUNCS backendFuncs;
+    PFN_PROCESS_PRIMS pfnProcessPrims;
+
+    Arena* pArena;  // This should only be used by API thread.
+};
+
+// Draw Context
+// The API thread sets up a draw context that exists for the life of the draw.
+// This draw context maintains all of the state needed for the draw operation.
+struct DRAW_CONTEXT
+{
+    SWR_CONTEXT *pContext;
+
+    uint64_t drawId;
+
+    bool isCompute;  // Is this DC a compute context?
+
+    FE_WORK FeWork;
+    volatile OSALIGNLINE(uint32_t) FeLock;
+    volatile OSALIGNLINE(bool) inUse;
+    volatile OSALIGNLINE(bool) doneFE;  // Is FE work done for this draw?
+
+    // Have all worker threads moved past draw in DC ring?
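+    // (threadsDoneFE and threadsDoneBE below count the worker threads that have
+    // retired this draw's frontend and backend work, respectively.)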
+    volatile OSALIGNLINE(uint32_t) threadsDoneFE;
+    volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+
+    uint64_t dependency;
+
+    MacroTileMgr* pTileMgr;
+
+    // The following fields are valid if isCompute is true.
+    volatile OSALIGNLINE(bool) doneCompute;  // Is this dispatch done? (isCompute)
+    DispatchQueue* pDispatch;                // Queue for thread groups. (isCompute)
+
+    DRAW_STATE* pState;
+    Arena* pArena;
+
+    uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+};
+
+INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
+{
+    SWR_ASSERT(pDC != nullptr);
+    SWR_ASSERT(pDC->pState != nullptr);
+
+    return pDC->pState->state;
+}
+
+INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
+{
+    SWR_ASSERT(pDC != nullptr);
+    SWR_ASSERT(pDC->pState != nullptr);
+
+    return pDC->pState->pPrivateState;
+}
+
+class HotTileMgr;
+
+struct SWR_CONTEXT
+{
+    // Draw Context Ring
+    // Each draw needs its own state in order to support multiple draws in flight across multiple threads.
+    // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
+    // of draws that can be in flight at any given time.
+    //
+    // Description:
+    // 1. State - When an application first sets state we'll request a new draw context to use.
+    //    a. If there are no available draw contexts then we'll have to wait until one becomes free.
+    //    b. If one is available then set pCurDrawContext to point to it and mark it in use.
+    //    c. All state calls set state on pCurDrawContext.
+    // 2. Draw - Creates and submits a work item associated with the current draw context.
+    //    a. Set pPrevDrawContext = pCurDrawContext
+    //    b. Set pCurDrawContext to NULL.
+    // 3. State - When an application sets state after a draw:
+    //    a. Same as step 1.
+    //    b. State is copied from prev draw context to current.
+    DRAW_CONTEXT* dcRing;
+
+    DRAW_CONTEXT *pCurDrawContext;   // This points to DC entry in ring for an unsubmitted draw.
+    DRAW_CONTEXT *pPrevDrawContext;  // This points to DC entry for the previous context submitted that we can copy state from.
+
+    // Draw State Ring
+    // When draws are very large (lots of primitives), the API thread will break them up.
+    // These split draws all have identical state, so instead of storing the state directly
+    // in the Draw Context (DC) we store it in a Draw State (DS). This allows multiple DCs
+    // to reference a single entry in the DS ring.
+    DRAW_STATE* dsRing;
+
+    uint32_t curStateId;      // Current index to the next available entry in the DS ring.
+
+    DRAW_STATE* subCtxSave;   // Save area for inactive contexts.
+    uint32_t curSubCtxId;     // Current index for active state subcontext.
+    uint32_t numSubContexts;  // Number of available subcontexts
+
+    uint32_t NumWorkerThreads;
+
+    THREAD_POOL threadPool;   // Thread pool associated with this context
+
+    std::condition_variable FifosNotEmpty;
+    std::mutex WaitLock;
+
+    // Draw Contexts will get a unique drawId generated from this
+    uint64_t nextDrawId;
+
+    // most recent draw id enqueued by the API thread
+    // written by the API thread, read by multiple workers
+    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
+
+    DRIVER_TYPE driverType;
+
+    uint32_t privateStateSize;
+
+    HotTileMgr *pHotTileMgr;
+
+    // tile load/store functions, passed in at create context time
+    PFN_LOAD_TILE pfnLoadTile;
+    PFN_STORE_TILE pfnStoreTile;
+    PFN_CLEAR_TILE pfnClearTile;
+
+    // Global Stats
+    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+
+    // Scratch space for workers.
+ uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; +}; + +void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); +void WakeAllThreads(SWR_CONTEXT *pContext); + +#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; } +#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; } diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h new file mode 100644 index 00000000000..4f245c8c53e --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -0,0 +1,245 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file depthstencil.h +* +* @brief Implements depth/stencil functionality +* +******************************************************************************/ +#pragma once +#include "common/os.h" +#include "format_conversion.h" + +INLINE +void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps) +{ + simdscalari stencil = _simd_castps_si(stencilps); + + switch (op) + { + case STENCILOP_KEEP: + break; + case STENCILOP_ZERO: + stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask); + break; + case STENCILOP_REPLACE: + stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask); + break; + case STENCILOP_INCRSAT: + { + simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + break; + } + case STENCILOP_DECRSAT: + { + simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + break; + } + case STENCILOP_INCR: + { + simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + break; + } + case STENCILOP_DECR: + { + simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + break; + } + case STENCILOP_INVERT: + { + simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); + stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); + break; + } + default: + break; + } +} + + +INLINE +simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, + bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase, + simdscalar* pStencilMask) +{ + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); + + simdscalar depthResult = _simd_set1_ps(-1.0f); + simdscalar zbuf; + + // clamp Z to viewport [minZ..maxZ] + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ)); + + if (pDSState->depthTestEnable) + { + switch (pDSState->depthTestFunc) + { + case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break; + case ZFUNC_ALWAYS: break; + default: + zbuf = _simd_load_ps((const float*)pDepthBase); + } + + switch (pDSState->depthTestFunc) + { + case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break; + case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break; + case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break; + case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break; + case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break; + case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break; + } + } + + simdscalar stencilMask = _simd_set1_ps(-1.0f); + + if (pDSState->stencilTestEnable) + { + uint8_t stencilRefValue; + uint32_t stencilTestFunc; + uint8_t stencilTestMask; + if (frontFacing || !pDSState->doubleSidedStencilTestEnable) + { + stencilRefValue = pDSState->stencilRefValue; + stencilTestFunc = pDSState->stencilTestFunc; + stencilTestMask = pDSState->stencilTestMask; + } + else + { + stencilRefValue = pDSState->backfaceStencilRefValue; 
+ stencilTestFunc = pDSState->backfaceStencilTestFunc; + stencilTestMask = pDSState->backfaceStencilTestMask; + } + + simdvector sbuf; + simdscalar stencilWithMask; + simdscalar stencilRef; + switch(stencilTestFunc) + { + case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break; + case ZFUNC_ALWAYS: break; + default: + LoadSOA<R8_UINT>(pStencilBase, sbuf); + + // apply stencil read mask + stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); + + // do stencil compare in float to avoid simd integer emulation in AVX1 + stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); + + stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); + break; + } + + switch(stencilTestFunc) + { + case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break; + case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break; + case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break; + case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break; + case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break; + case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break; + } + } + + simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); + depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask); + + *pStencilMask = stencilMask; + return depthWriteMask; +} + +INLINE +void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, + bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, + BYTE *pStencilBase, const simdscalar& stencilMask) +{ + if (pDSState->depthWriteEnable) + { + // clamp Z to viewport [minZ..maxZ] + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ)); + + simdscalar vMask = _simd_and_ps(depthMask, coverageMask); + _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ); + } + + if (pDSState->stencilWriteEnable) + { + simdvector sbuf; + LoadSOA<R8_UINT>(pStencilBase, sbuf); + simdscalar stencilbuf = sbuf.v[0]; + + uint8_t stencilRefValue; + uint32_t stencilFailOp; + uint32_t stencilPassDepthPassOp; + uint32_t stencilPassDepthFailOp; + uint8_t stencilWriteMask; + if (frontFacing || !pDSState->doubleSidedStencilTestEnable) + { + stencilRefValue = pDSState->stencilRefValue; + stencilFailOp = pDSState->stencilFailOp; + stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; + stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; + stencilWriteMask = pDSState->stencilWriteMask; + } + else + { + stencilRefValue = pDSState->backfaceStencilRefValue; + stencilFailOp = pDSState->backfaceStencilFailOp; + stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; + stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; + stencilWriteMask = pDSState->backfaceStencilWriteMask; + } + + simdscalar stencilps = stencilbuf; + simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); + + simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask); + simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask); + simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); + + simdscalar origStencil = stencilps; + + 
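+        // Each covered lane falls in exactly one of the three masks above
+        // (stencil fail, stencil pass/depth fail, stencil pass/depth pass), so
+        // the three StencilOp applications below are mutually exclusive per lane;
+        // lanes outside coverageMask are restored from origStencil by the final blend.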
StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps); + + // apply stencil write mask + simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); + stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); + stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); + + simdvector stencilResult; + stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask); + StoreSOA<R8_UINT>(stencilResult, pStencilBase); + } + +} diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp new file mode 100644 index 00000000000..7e556012e6b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -0,0 +1,136 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file fifo.hpp +* +* @brief Definitions for our fifos used for thread communication. 
+* +******************************************************************************/ +#pragma once + + +#include "common/os.h" +#include "arena.h" + +#include <vector> +#include <cassert> + +template<class T> +struct QUEUE +{ + OSALIGNLINE(volatile uint32_t) mLock{ 0 }; + OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 }; + std::vector<T*> mBlocks; + T* mCurBlock{ nullptr }; + uint32_t mHead{ 0 }; + uint32_t mTail{ 0 }; + uint32_t mCurBlockIdx{ 0 }; + + // power of 2 + static const uint32_t mBlockSizeShift = 6; + static const uint32_t mBlockSize = 1 << mBlockSizeShift; + + void clear(Arena& arena) + { + mHead = 0; + mTail = 0; + mBlocks.clear(); + T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + mBlocks.push_back(pNewBlock); + mCurBlock = pNewBlock; + mCurBlockIdx = 0; + + mNumEntries = 0; + _ReadWriteBarrier(); + mLock = 0; + } + + uint32_t getNumQueued() + { + return mNumEntries; + } + + bool tryLock() + { + if (mLock) + { + return false; + } + + // try to lock the FIFO + LONG initial = InterlockedCompareExchange(&mLock, 1, 0); + return (initial == 0); + } + + void unlock() + { + mLock = 0; + } + + T* peek() + { + if (mNumEntries == 0) + { + return nullptr; + } + uint32_t block = mHead >> mBlockSizeShift; + return &mBlocks[block][mHead & (mBlockSize-1)]; + } + + void dequeue_noinc() + { + mHead ++; + mNumEntries --; + } + + bool enqueue_try_nosync(Arena& arena, const T* entry) + { + memcpy(&mCurBlock[mTail], entry, sizeof(T)); + + mTail ++; + if (mTail == mBlockSize) + { + if (++mCurBlockIdx < mBlocks.size()) + { + mCurBlock = mBlocks[mCurBlockIdx]; + } + else + { + T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + SWR_ASSERT(newBlock); + + mBlocks.push_back(newBlock); + mCurBlock = newBlock; + } + + mTail = 0; + } + + mNumEntries ++; + return true; + } + + void destroy() + { + } + +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h new file mode 100644 index 00000000000..83d85fc86d8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -0,0 +1,196 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+*
+* @file format_conversion.h
+*
+* @brief Helpers for converting pixel formats to and from SOA RGBA32_FLOAT.
+*
+******************************************************************************/
+#include "format_types.h"
+#include "format_traits.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads SIMD packed pixels in SOA format and converts to
+///        SOA RGBA32_FLOAT format.
+/// @param pSrc - source data in SOA form
+/// @param dst - output data in SOA form
+template<SWR_FORMAT SrcFormat>
+INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+{
+    // fast path for float32
+    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
+    {
+        auto lambda = [&](int comp)
+        {
+            simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar)));
+
+            dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+        };
+
+        UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+        return;
+    }
+
+    auto lambda = [&](int comp)
+    {
+        // load SIMD components
+        simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
+
+        // unpack
+        vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
+
+        // convert
+        if (FormatTraits<SrcFormat>::isNormalized(comp))
+        {
+            vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
+            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
+        }
+
+        dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+
+        pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
+    };
+
+    UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clamps the given component based on the requirements of the
+///        Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
+{
+    if (FormatTraits<Format>::isNormalized(Component))
+    {
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
+        {
+            vComp = _simd_max_ps(vComp, _simd_setzero_ps());
+        }
+
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
+        {
+            vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
+        }
+        vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
+    }
+    else if (FormatTraits<Format>::GetBPC(Component) < 32)
+    {
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
+        {
+            int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
+            int iMin = 0;
+            simdscalari vCompi = _simd_castps_si(vComp);
+            vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
+            vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
+            vComp = _simd_castsi_ps(vCompi);
+        }
+        else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
+        {
+            int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
+            int iMin = -1 - iMax;
+            simdscalari vCompi = _simd_castps_si(vComp);
+            vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
+            vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
+            vComp = _simd_castsi_ps(vCompi);
+        }
+    }
+
+    return vComp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Normalizes the given component based on the requirements of the
+///        Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
+{
+    if (FormatTraits<Format>::isNormalized(Component))
+    {
+        vComp = _simd_mul_ps(vComp,
_simd_set1_ps(FormatTraits<Format>::fromFloat(Component))); + vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); + } + return vComp; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert and store simdvector of pixels in SOA +/// RGBA32_FLOAT to SOA format +/// @param src - source data in SOA form +/// @param dst - output data in SOA form +template<SWR_FORMAT DstFormat> +INLINE void StoreSOA(const simdvector &src, BYTE *pDst) +{ + // fast path for float32 + if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) + { + for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) + { + simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + + // Gamma-correct + if (FormatTraits<DstFormat>::isSRGB) + { + if (comp < 3) // Input format is always RGBA32_FLOAT. + { + vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); + } + } + + _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); + } + return; + } + + auto lambda = [&](int comp) + { + simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + + // Gamma-correct + if (FormatTraits<DstFormat>::isSRGB) + { + if (comp < 3) // Input format is always RGBA32_FLOAT. + { + vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); + } + } + + // clamp + vComp = Clamp<DstFormat>(vComp, comp); + + // normalize + vComp = Normalize<DstFormat>(vComp, comp); + + // pack + vComp = FormatTraits<DstFormat>::pack(comp, vComp); + + // store + FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp); + + pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; + }; + + UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h new file mode 100644 index 00000000000..52340f4987a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h @@ -0,0 +1,3548 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file format_traits.h +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +#include "format_types.h" +#include "utils.h" + + +////////////////////////////////////////////////////////////////////////// +/// FormatSwizzle - Component swizzle selects +////////////////////////////////////////////////////////////////////////// +template<UINT comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0> +struct FormatSwizzle +{ + // Return swizzle select for component. + INLINE static uint32_t swizzle(UINT c) + { + static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; + return s[c]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits - Format traits +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT format> +struct FormatTraits : + ComponentTraits<SWR_TYPE_UNKNOWN, 0>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0> +{ + static const uint32_t bpp{ 0 }; + static const uint32_t numComps{ 0 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 128 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose32_32_32_32 TransposeT; + typedef Format4<32, 32, 32, 32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_SINT> : + ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 128 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose32_32_32_32 TransposeT; + typedef Format4<32, 32, 32, 32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_UINT> : + ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, 
SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization for R16G16B16A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_UINT> - Format traits specialization for R32G32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32A32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNUSED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_UNUSED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS_LD
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
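The fourth value in each Defaults<> list above is the raw bit pattern substituted for a missing alpha component: the integer formats use 0x1, while the float formats use 0x3f800000, which is the IEEE-754 single-precision encoding of 1.0f. A minimal standalone check of that bit pattern (illustrative only, not part of this patch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // 0x3f800000 reinterpreted as a float is exactly 1.0f, the natural
        // default alpha for the float formats in this header.
        uint32_t bits = 0x3f800000;
        float alpha;
        std::memcpy(&alpha, &bits, sizeof(alpha));
        assert(alpha == 1.0f);
        return 0;
    }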
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SNORM> - Format traits specialization for R8G8B8A8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R11G11B10_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose11_11_10 TransposeT;
+    typedef Format3<11, 11, 10> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R24_UNORM_X8_TYPELESS> :
+    ComponentTraits<SWR_TYPE_UNORM, 24>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<24> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS_LD> - Format traits specialization for R24_UNORM_X8_TYPELESS_LD
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R24_UNORM_X8_TYPELESS_LD> :
+    ComponentTraits<SWR_TYPE_UNORM, 24>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<24> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I24X8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose24_8 TransposeT;
+    typedef Format2<24, 8> FormatT;
+};
+
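Since every trait above is a compile-time constant, callers can compute layout information without a runtime format switch. A hypothetical sketch (RowSizeBytes is not part of this patch) of how the traits might be consumed, assuming bpp is bits per element and noting that bcWidth is 1 for these uncompressed formats:

    #include <cstdint>

    // Hypothetical helper: bytes needed for one row of 'width' pixels.
    // For the formats above bcWidth is 1, so this reduces to width * bpp / 8.
    template <SWR_FORMAT format>
    constexpr uint32_t RowSizeBytes(uint32_t width)
    {
        return (width / FormatTraits<format>::bcWidth) *
               (FormatTraits<format>::bpp / 8);
    }

    // e.g. RowSizeBytes<R32_FLOAT>(100) == 400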
+////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L24X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose24_8 TransposeT; + typedef Format2<24, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<I32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<A32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B8G8R8X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 
8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B8G8R8X8_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8X8_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R9G9B9E5_SHAREDEXP> : + ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, 
SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose9_9_9_5 TransposeT; + typedef Format4<9, 9, 9, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10X2_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNUSED, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L16A16_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 1 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R10G10B10X2_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_UNUSED, 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8A8_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, + 
FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8A8_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R16G16_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R16G16_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t 
alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G6R5_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_6_5 TransposeT; + typedef Format3<5, 6, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G6R5_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_6_5 TransposeT; + typedef Format3<5, 6, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G5R5A1_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 
}; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_5_5_1 TransposeT; + typedef Format4<5, 5, 5, 1> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G5R5A1_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_5_5_1 TransposeT; + typedef Format4<5, 5, 5, 1> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B4G4R4A4_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose4_4_4_4 TransposeT; + typedef Format4<4, 4, 4, 4> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for B4G4R4A4_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B4G4R4A4_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose4_4_4_4 TransposeT; + typedef Format4<4, 4, 4, 4> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8 TransposeT; + 
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_UNORM> - Format traits specialization for R16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
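A quick aside on how these traits get consumed: every field is a compile-time constant, so sizing math folds away entirely at compile time. A minimal sketch, not part of the patch, assuming formats.h is in scope:

    // Hypothetical helper: bytes per row of 'width' texels, rounding the
    // width up to whole bcWidth-sized blocks first (bcWidth is 1 for the
    // uncompressed formats above).
    template <SWR_FORMAT format>
    constexpr uint32_t RowPitchBytes(uint32_t width)
    {
        return ((width + FormatTraits<format>::bcWidth - 1) /
                FormatTraits<format>::bcWidth) * FormatTraits<format>::bpp / 8;
    }

    static_assert(RowPitchBytes<R16_UNORM>(4) == 8, "16 bpp -> 2 bytes per texel");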
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
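The I16/L16/A16 variants above share one stored component and differ only in routing: FormatSwizzle<3> sends the single channel to the alpha slot, while hasAlpha and alphaComp record whether alpha comes from memory at all. A sketch of a compile-time query built on that, again assuming formats.h is included:

    // Illustrative only: formats without stored alpha fall back to the
    // Defaults<> value (0x3f800000 is the bit pattern of 1.0f).
    template <SWR_FORMAT format>
    constexpr bool AlphaIsStored() { return FormatTraits<format>::hasAlpha; }

    static_assert(AlphaIsStored<L8A8_UNORM>(), "alpha is stored component 1");
    static_assert(!AlphaIsStored<R16_UNORM>(), "alpha synthesized as 1.0f");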
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B5G5R5X1_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose5_5_5_1 TransposeT;
+    typedef Format4<5, 5, 5, 1> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B5G5R5X1_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose5_5_5_1 TransposeT;
+    typedef Format4<5, 5, 5, 1> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
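The SCALED variants reuse the UNORM layouts but change conversion semantics: UNORM/SNORM map the stored integer range onto [0, 1] / [-1, 1], while USCALED/SSCALED convert the raw integer to float unchanged. A scalar sketch of the distinction, not part of the patch:

    #include <cstdint>

    static inline float Unorm8ToFloat(uint8_t v)   { return v * (1.0f / 255.0f); } // -> [0, 1]
    static inline float Uscaled8ToFloat(uint8_t v) { return (float)v; }            // -> [0, 255]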
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UINT> - Format traits specialization for L8A8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SINT> - Format traits specialization for R8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<YCRCB_SWAPUVY> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ true };
+    static const uint32_t bcWidth{ 2 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
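YCRCB_SWAPUVY is one of the few entries with isSubsampled{ true }: bcWidth{ 2 } records that two horizontally adjacent texels share one 32-bit word (4:2:2 packing, two luma samples plus one shared chroma pair). The byte order differs between the SWAPUVY/SWAPUV variants, but the addressing consequence is the same; a sketch under that assumption, not part of the patch:

    #include <cstdint>

    // Illustrative only: in a 4:2:2 packed format the byte offset is
    // computed per texel *pair*, so x is first rounded down to even.
    static inline uint32_t Ycbcr422WordOffset(uint32_t x, uint32_t y, uint32_t pitchBytes)
    {
        return y * pitchBytes + (x / 2) * 4; // one 4-byte word covers two texels
    }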
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC1_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC3_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC4_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC5_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
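For the block-compressed entries, bpp counts bits per bcWidth x bcHeight block rather than per texel (64 for BC1/BC4, 128 for the rest), so surface sizing has to round both dimensions up to whole blocks. A hypothetical sizing helper, assuming formats.h is included:

    template <SWR_FORMAT format>
    constexpr uint32_t SurfaceBytes(uint32_t width, uint32_t height)
    {
        return ((width  + FormatTraits<format>::bcWidth  - 1) / FormatTraits<format>::bcWidth) *
               ((height + FormatTraits<format>::bcHeight - 1) / FormatTraits<format>::bcHeight) *
               (FormatTraits<format>::bpp / 8);
    }

    static_assert(SurfaceBytes<BC1_UNORM>(8, 8) == 32, "2x2 blocks of 8 bytes");
    static_assert(SurfaceBytes<BC3_UNORM>(8, 8) == 64, "2x2 blocks of 16 bytes");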
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC1_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC3_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<YCRCB_SWAPUV> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ true };
+    static const uint32_t bcWidth{ 2 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC4_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC5_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC6H_SF16> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC7_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC7_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC6H_UF16> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
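The B10G10R10A2 family differs from R10G10B10A2 only in FormatSwizzle<2, 1, 0, 3>, which routes stored channel 0 to the blue output slot. Assuming ComponentTraits lists stored channels from the least-significant bits upward (consistent with Format4<10, 10, 10, 2>), a scalar sketch of what the swizzle means:

    #include <cstdint>

    // Illustrative only, not part of the patch; the low-bits-first channel
    // order is an assumption.
    static inline void UnpackB10G10R10A2(uint32_t texel, uint32_t rgba[4])
    {
        uint32_t c0 = (texel >>  0) & 0x3ff; // stored channel 0
        uint32_t c1 = (texel >> 10) & 0x3ff; // stored channel 1
        uint32_t c2 = (texel >> 20) & 0x3ff; // stored channel 2
        uint32_t c3 = (texel >> 30) & 0x3;   // stored channel 3
        // FormatSwizzle<2, 1, 0, 3>: stored channel i lands in slot swizzle[i].
        rgba[2] = c0; rgba[1] = c1; rgba[0] = c2; rgba[3] = c3;
    }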
Format traits specialization for B10G10R10A2_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10A2_UINT> : + ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10A2_SINT> : + ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8_UINT> : + ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 24 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8 TransposeT; + typedef Format3<8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8_SINT> : + ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 24 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8 TransposeT; + typedef Format3<8, 8, 8> FormatT; +}; + diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h new file mode 100644 index 00000000000..aa350259a15 --- /dev/null +++ 
b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -0,0 +1,1075 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_types.h
+*
+* @brief Definitions for SWR_FORMAT functions.
+*
+******************************************************************************/
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking same pixel sizes
+//////////////////////////////////////////////////////////////////////////
+template <uint32_t NumBits, bool Signed = false>
+struct PackTraits
+{
+    static const uint32_t MyNumBits = NumBits;
+    static simdscalar loadSOA(const BYTE *pSrc) = delete;
+    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+    static simdscalar unpack(simdscalar &in) = delete;
+    static simdscalar pack(simdscalar &in) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking unused channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<0, false>
+{
+    static const uint32_t MyNumBits = 0;
+
+    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
+    static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+    static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
+    static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, false>
+{
+    static const uint32_t MyNumBits = 8;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // store simd bytes
+#if KNOB_SIMD_WIDTH == 8
+        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
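+        // unpack widens the 8 packed 8-bit channel values loaded by loadSOA
+        // (one per SIMD lane) into eight 32-bit integer lanes.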
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepu8_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit signed channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, true>
+{
+    static const uint32_t MyNumBits = 8;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // store simd bytes
+#if KNOB_SIMD_WIDTH == 8
+        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        SWR_ASSERT(0); // I think this may be incorrect.
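+        // Editor's note (an assumption about the concern above): _mm_shuffle_epi8
+        // zero-fills bytes whose selector has the high bit set (0x80), so resHi
+        // below is zero-extended rather than sign-extended; negative SINT8 values
+        // in bytes 4..7 would decode incorrectly. A sign-correct sketch for the
+        // high half would be:
+        //     __m128i resHi = _mm_cvtepi8_epi32(_mm_srli_si128(src, 4));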
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi8_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, false>
+{
+    static const uint32_t MyNumBits = 16;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // store 16B (2B * 8)
+        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepu16_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        return _mm256_castsi256_ps(res);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit signed channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, true>
+{
+    static const uint32_t MyNumBits = 16;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // store 16B (2B * 8)
+        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
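+        // Editor's note (assumption): as with the signed 8-bit path above, the
+        // pshufb-based AVX variant below zero-extends the upper lanes, so negative
+        // SINT16 values in words 4..7 would lose their sign bits; a sign-correct
+        // high half might be _mm_cvtepi16_epi32(_mm_srli_si128(src, 8)).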
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        SWR_ASSERT(0); // I think this is incorrectly implemented
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi16_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        return _mm256_castsi256_ps(res);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 32 bit channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<32, false>
+{
+    static const uint32_t MyNumBits = 32;
+
+    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
+    static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+    static simdscalar unpack(simdscalar &in) { return in; }
+    static simdscalar pack(simdscalar &in) { return in; }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE type, uint32_t NumBits>
+struct TypeTraits : PackTraits<NumBits>
+{
+    static const SWR_TYPE MyType = type;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
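+    // Editor's note: integer (UINT/SINT) formats are not scaled through float;
+    // toFloat()/fromFloat() are placeholders here and fromFloat() asserts if it
+    // is ever called.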
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT32
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT32
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM5
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 31.0f; }
+    static float fromFloat() { return 31.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM6
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 63.0f; }
+    static float fromFloat() { return 63.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 255.0f; }
+    static float fromFloat() { return 255.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+    static float toFloat() { return 1.0f / 127.0f; }
+    static float fromFloat() { return 127.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 65535.0f; }
+    static float fromFloat() { return 65535.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+    static float toFloat() { return 1.0f / 32767.0f; }
+    static float fromFloat() { return 32767.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM24
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 16777215.0f; }
+    static float fromFloat() { return 16777215.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// FLOAT Specializations from here on...
+//////////////////////////////////////////////////////////////////////////
+#define TO_M128i(a) _mm_castps_si128(a)
+#define TO_M128(a) _mm_castsi128_ps(a)
+
+#include "math.h"
+
+template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
+inline static __m128 fastpow(__m128 arg) {
+    __m128 ret = arg;
+
+    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
+        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
+
+    // Apply a constant pre-correction factor.
+    ret = _mm_mul_ps(ret, factor);
+
+    // Reinterpret arg as integer to obtain logarithm.
+    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
+    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+
+    // Multiply logarithm by power.
+    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
+
+    // Convert back to "integer" to exponentiate.
+    //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
+    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+
+    return ret;
+}
+
+inline static __m128 pow512_4(__m128 arg) {
+    // 5/12 is too small, so compute the 4th root of 20/12 instead.
+    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
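+    // Editor's note: concretely, x^(5/12) = (x^(5/3))^(1/4). fastpow<2, 3, ...>
+    // approximates x^(2/3), xover = x * x^(2/3) ~= x^(5/3) (with xunder as the
+    // matching low-side estimate), and the two x*rsqrt(x) steps at the end take
+    // the fourth root of the blended result.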
+    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
+    __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
+    __m128 xover = _mm_mul_ps(arg, xf);
+
+    __m128 xfm1 = _mm_rsqrt_ps(xf);
+    __m128 x2 = _mm_mul_ps(arg, arg);
+    __m128 xunder = _mm_mul_ps(x2, xfm1);
+
+    // sqrt2 * over + 2 * sqrt2 * under
+    __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+        _mm_add_ps(xover, xunder));
+
+    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+    return xavg;
+}
+
+inline static __m128 powf_wrapper(__m128 Base, float Exp)
+{
+    float *f = (float *)(&Base);
+
+    return _mm_set_ps(powf(f[0], Exp),
+        powf(f[1], Exp),
+        powf(f[2], Exp),
+        powf(f[3], Exp));
+}
+
+static inline __m128 ConvertFloatToSRGB2(__m128& Src)
+{
+    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
+    __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
+
+    // squeeze the mask down to 16 bits (4 bits per DWORD)
+    int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
+
+    __m128 Result;
+
+    if (CompareResult == 0xFFFF)
+    {
+        // all DWORDs are <= the threshold
+        Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+    }
+    else if (CompareResult == 0x0)
+    {
+        // all DWORDs are > the threshold
+        __m128 fSrc_0RGB = Src;
+
+        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
+#if KNOB_USE_FAST_SRGB == TRUE
+        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
+        __m128 f = pow512_4(fSrc_0RGB);
+#else
+        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
+#endif
+        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
+        Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
+    }
+    else
+    {
+        // some DWORDs are <= the threshold and some are > threshold
+        __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+
+        __m128 fSrc_0RGB = Src;
+
+        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
+#if KNOB_USE_FAST_SRGB == TRUE
+        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
+ __m128 f = pow512_4(fSrc_0RGB); +#else + __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); +#endif + f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); + f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); + + // Clear the alpha (is garbage after the sub) + __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); + + __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); + __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); + __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); + + Result = TO_M128(CombinedParts); + } + + return Result; +} + +////////////////////////////////////////////////////////////////////////// +/// TypeTraits - Format type traits specialization for FLOAT16 +////////////////////////////////////////////////////////////////////////// +template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> +{ + static const SWR_TYPE MyType = SWR_TYPE_FLOAT; + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } + + static simdscalar pack(const simdscalar &in) + { +#if KNOB_SIMD_WIDTH == 8 +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // input is 8 packed float32, output is 8 packed float16 + simdscalari src = _simd_castps_si(in); + + static const uint32_t FLOAT_EXP_BITS = 8; + static const uint32_t FLOAT_MANTISSA_BITS = 23; + static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; + static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; + + static const uint32_t HALF_EXP_BITS = 5; + static const uint32_t HALF_MANTISSA_BITS = 10; + static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1; + static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; + + // minimum exponent required, exponents below this are flushed to 0. 
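+        // Editor's note: half-precision has a 5-bit exponent biased by 15, so the
+        // smallest normal half exponent is -14; rebased to float32's bias of 127
+        // that is -14 + 127 = 113 (FLOAT_EXP_MIN below). Anything more than
+        // HALF_MANTISSA_BITS + 1 below that cannot even round into a half
+        // subnormal, which gives the flush-to-zero bound.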
+ static const int32_t HALF_EXP_MIN = -14; + static const int32_t FLOAT_EXP_BIAS = 127; + static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; + static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand + + // maximum exponent required, exponents above this are set to infinity + static const int32_t HALF_EXP_MAX = 15; + static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; + + const simdscalari vSignMask = _simd_set1_epi32(0x80000000); + const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); + const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); + const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); + + simdscalari vSign = _simd_and_si(src, vSignMask); + simdscalari vExp = _simd_and_si(src, vExpMask); + simdscalari vMan = _simd_and_si(src, vManMask); + + simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); + simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); + simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); + simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); + + simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); + + // pack output 16-bits into the lower 16-bits of each 32-bit channel + simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); + vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + + // Flush To Zero + vDst = _simd_andnot_si(vFTZMask, vDst); + // Apply Infinites / NaN + vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); + + // Apply clamps + vDst = _simd_andnot_si(vClampMask, vDst); + vDst = _simd_or_si(vDst, + _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); + + // Compute Denormals (subnormals) + if (!_mm256_testz_si256(vDenormMask, vDenormMask)) + { + uint32_t *pDenormMask = (uint32_t*)&vDenormMask; + uint32_t *pExp = (uint32_t*)&vExp; + uint32_t *pMan = (uint32_t*)&vMan; + uint32_t *pDst = (uint32_t*)&vDst; + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) + { + if (pDenormMask[i]) + { + // Need to compute subnormal value + uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; + uint32_t mantissa = pMan[i] | + (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
Make it explicit + + pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + } + } + } + + // Add in sign bits + vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); + + // Pack to lower 128-bits + vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); + +#if 0 +#if !defined(NDEBUG) + simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); + + for (uint32_t i = 0; i < 4; ++i) + { + SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); + } +#endif +#endif + + return _simd_castsi_ps(vDst); + +#else + return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); +#endif +#else +#error Unsupported vector width +#endif + } + + static simdscalar unpack(const simdscalar &in) + { + // input is 8 packed float16, output is 8 packed float32 + SWR_ASSERT(0); // @todo + return _simd_setzero_ps(); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// TypeTraits - Format type traits specialization for FLOAT32 +////////////////////////////////////////////////////////////////////////// +template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> +{ + static const SWR_TYPE MyType = SWR_TYPE_FLOAT; + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static inline simdscalar convertSrgb(simdscalar &in) + { +#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2) + __m128 srcLo = _mm256_extractf128_ps(in, 0); + __m128 srcHi = _mm256_extractf128_ps(in, 1); + + srcLo = ConvertFloatToSRGB2(srcLo); + srcHi = ConvertFloatToSRGB2(srcHi); + + in = _mm256_insertf128_ps(in, srcLo, 0); + in = _mm256_insertf128_ps(in, srcHi, 1); + +#endif + return in; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats. +////////////////////////////////////////////////////////////////////////// +template<uint32_t x> +struct Format1 +{ + union + { + uint32_t r : x; + + ///@ The following are here to provide full template needed in Formats. + uint32_t g : x; + uint32_t b : x; + uint32_t a : x; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats - 8 bit specialization +////////////////////////////////////////////////////////////////////////// +template<> +struct Format1<8> +{ + union + { + uint8_t r; + + ///@ The following are here to provide full template needed in Formats. + uint8_t g; + uint8_t b; + uint8_t a; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats - 16 bit specialization +////////////////////////////////////////////////////////////////////////// +template<> +struct Format1<16> +{ + union + { + uint16_t r; + + ///@ The following are here to provide full template needed in Formats. + uint16_t g; + uint16_t b; + uint16_t a; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format2 - Bitfield for 2 component formats. +////////////////////////////////////////////////////////////////////////// +template<uint32_t x, uint32_t y> +union Format2 +{ + struct + { + uint32_t r : x; + uint32_t g : y; + }; + struct + { + ///@ The following are here to provide full template needed in Formats. 
+        uint32_t b : x;
+        uint32_t a : y;
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format2 - Bitfield for 2 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format2<8,8>
+{
+    struct
+    {
+        uint16_t r : 8;
+        uint16_t g : 8;
+    };
+    struct
+    {
+        ///@ The following are here to provide full template needed in Formats.
+        uint16_t b : 8;
+        uint16_t a : 8;
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z>
+union Format3
+{
+    struct
+    {
+        uint32_t r : x;
+        uint32_t g : y;
+        uint32_t b : z;
+    };
+    uint32_t a; ///@note This is here to provide full template needed in Formats.
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format3<5,6,5>
+{
+    struct
+    {
+        uint16_t r : 5;
+        uint16_t g : 6;
+        uint16_t b : 5;
+    };
+    uint16_t a; ///@note This is here to provide full template needed in Formats.
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Format4
+{
+    uint32_t r : x;
+    uint32_t g : y;
+    uint32_t b : z;
+    uint32_t a : w;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<5,5,5,1>
+{
+    uint16_t r : 5;
+    uint16_t g : 5;
+    uint16_t b : 5;
+    uint16_t a : 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<4,4,4,4>
+{
+    uint16_t r : 4;
+    uint16_t g : 4;
+    uint16_t b : 4;
+    uint16_t a : 4;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Defaults - Default component values for missing channels.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Defaults
+{
+    INLINE static uint32_t GetDefault(uint32_t comp)
+    {
+        static const uint32_t defaults[4]{ x, y, z, w };
+        return defaults[comp];
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ComponentTraits - Component type traits.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
+struct ComponentTraits
+{
+    INLINE static SWR_TYPE GetType(uint32_t comp)
+    {
+        static const SWR_TYPE CompType[4]{ X, Y, Z, W };
+        return CompType[comp];
+    }
+
+    INLINE static uint32_t GetBPC(uint32_t comp)
+    {
+        static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
+        return MyBpc[comp];
+    }
+
+    INLINE static bool isNormalized(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
+        case 1:
+            return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
+        case 2:
+            return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
+        case 3:
+            return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
+        }
+        SWR_ASSERT(0);
+        return false;
+    }
+
+    INLINE static float toFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::toFloat();
+        case 1:
+            return TypeTraits<Y, NumBitsY>::toFloat();
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::toFloat();
+        case 3:
+            return TypeTraits<W, NumBitsW>::toFloat();
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::toFloat();
+    }
+
+    INLINE static float fromFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::fromFloat();
+        case 1:
+            return TypeTraits<Y, NumBitsY>::fromFloat();
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::fromFloat();
+        case 3:
+            return TypeTraits<W, NumBitsW>::fromFloat();
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::fromFloat();
+    }
+
+    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
+        case 3:
+            return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+    }
+
+    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+    {
+        switch (comp)
+        {
+        case 0:
+            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+            return;
+        case 1:
+            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
+            return;
+        case 2:
+            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
+            return;
+        case 3:
+            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
+            return;
+        }
+        SWR_ASSERT(0);
+        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+    }
+
+    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::unpack(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::unpack(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::unpack(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::unpack(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::unpack(in);
+    }
+
+    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::pack(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::pack(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::pack(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::pack(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::pack(in);
+    }
+
+    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::convertSrgb(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::convertSrgb(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::convertSrgb(in);
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
new file mode 100644
index 00000000000..f43a672bd82
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -0,0 +1,2345 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file frontend.cpp
+*
+* @brief Implementation for Frontend which handles vertex processing,
+*        primitive assembly, clipping, binning, etc.
+*
+******************************************************************************/
+
+#include "api.h"
+#include "frontend.h"
+#include "backend.h"
+#include "context.h"
+#include "rdtsc_core.h"
+#include "rasterizer.h"
+#include "utils.h"
+#include "threads.h"
+#include "pa.h"
+#include "clip.h"
+#include "tilemgr.h"
+#include "tessellator.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper function to generate a bitmask
+static INLINE uint32_t GenMask(uint32_t numBits)
+{
+    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
+    return ((1U << numBits) - 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Offsets added to post-viewport vertex positions based on
+/// raster state.
+static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+{
+    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
+    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrSync.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to sync callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessSync(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
+    BE_WORK work;
+    work.type = SYNC;
+    work.pfnWork = ProcessSyncBE;
+    work.desc.sync = *pSync;
+
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    pTileMgr->enqueue(0, 0, &work);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrGetStats.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to stats callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessQueryStats(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
+    BE_WORK work;
+    work.type = QUERYSTATS;
+    work.pfnWork = ProcessQueryStatsBE;
+    work.desc.queryStats = *pQueryStats;
+
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    pTileMgr->enqueue(0, 0, &work);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrClearRenderTarget.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to clear callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessClear(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue a clear to each macro tile
+    // compute macro tile bounds for the current scissor/viewport
+    uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
+    uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+
+    BE_WORK work;
+    work.type = CLEAR;
+    work.pfnWork = ProcessClearBE;
+    work.desc.clear = *pClear;
+
+    for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
+    {
+        for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrStoreTiles.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessStoreTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessStoreTiles);
+    STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue a store to each macro tile
+    // compute macro tile bounds for the current render target
+    const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
+    const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
+    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+
+    // store tiles
+    BE_WORK work;
+    work.type = STORETILES;
+    work.pfnWork = ProcessStoreTileBE;
+    work.desc.storeTiles = *pStore;
+
+    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    {
+        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+
+    RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrInvalidateTiles.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessInvalidateTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessInvalidateTiles);
+    INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue an invalidate to each macro tile
+    // compute macro tile bounds for the current render target
+    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
+    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
+    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+
+    // invalidate tiles
+    BE_WORK work;
+    work.type = INVALIDATETILES;
+    work.pfnWork = ProcessInvalidateTilesBE;
+    work.desc.invalidateTiles = *pInv;
+
+    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    {
+        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+
+    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of primitives given the number of verts.
+/// @param mode - primitive topology for draw operation.
+/// @param numPrims - number of vertices or indices for draw.
+/// @todo Frontend needs to be refactored. This will go in appropriate place then.
+uint32_t GetNumPrims(
+    PRIMITIVE_TOPOLOGY mode,
+    uint32_t numPrims)
+{
+    switch (mode)
+    {
+    case TOP_POINT_LIST: return numPrims;
+    case TOP_TRIANGLE_LIST: return numPrims / 3;
+    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_QUAD_LIST: return numPrims / 4;
+    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
+    case TOP_LINE_STRIP: return numPrims < 2 ?
0 : numPrims - 1; + case TOP_LINE_LIST: return numPrims / 2; + case TOP_LINE_LOOP: return numPrims; + case TOP_RECT_LIST: return numPrims / 3; + case TOP_LINE_LIST_ADJ: return numPrims / 4; + case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; + case TOP_TRI_LIST_ADJ: return numPrims / 6; + case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + return numPrims / (mode - TOP_PATCHLIST_BASE); + + case TOP_POLYGON: + case TOP_POINT_LIST_BF: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LINE_STRIP_CONT_BF: + case TOP_TRIANGLE_FAN_NOSTIPPLE: + case TOP_TRI_STRIP_REVERSE: + case TOP_PATCHLIST_BASE: + case TOP_UNKNOWN: + SWR_ASSERT(false, "Unsupported topology: %d", mode); + return 0; + } + + return 0; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Computes the number of verts given the number of primitives. +/// @param mode - primitive topology for draw operation. +/// @param numPrims - number of primitives for draw. +uint32_t GetNumVerts( + PRIMITIVE_TOPOLOGY mode, + uint32_t numPrims) +{ + switch (mode) + { + case TOP_POINT_LIST: return numPrims; + case TOP_TRIANGLE_LIST: return numPrims * 3; + case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0; + case TOP_QUAD_LIST: return numPrims * 4; + case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; + case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; + case TOP_LINE_LIST: return numPrims * 2; + case TOP_LINE_LOOP: return numPrims; + case TOP_RECT_LIST: return numPrims * 3; + case TOP_LINE_LIST_ADJ: return numPrims * 4; + case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; + case TOP_TRI_LIST_ADJ: return numPrims * 6; + case TOP_TRI_STRIP_ADJ: return numPrims ? 
(numPrims + 2) * 2 : 0; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + return numPrims * (mode - TOP_PATCHLIST_BASE); + + case TOP_POLYGON: + case TOP_POINT_LIST_BF: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LINE_STRIP_CONT_BF: + case TOP_TRIANGLE_FAN_NOSTIPPLE: + case TOP_TRI_STRIP_REVERSE: + case TOP_PATCHLIST_BASE: + case TOP_UNKNOWN: + SWR_ASSERT(false, "Unsupported topology: %d", mode); + return 0; + } + + return 0; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Return number of verts per primitive. +/// @param topology - topology +/// @param includeAdjVerts - include adjacent verts in primitive vertices +INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) +{ + uint32_t numVerts = 0; + switch (topology) + { + case TOP_POINT_LIST: + case TOP_POINT_LIST_BF: + numVerts = 1; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LIST_ADJ: + case TOP_LINE_LOOP: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LISTSTRIP_ADJ: + numVerts = 2; + break; + case TOP_TRIANGLE_LIST: + case TOP_TRIANGLE_STRIP: + case TOP_TRIANGLE_FAN: + case TOP_TRI_LIST_ADJ: + case TOP_TRI_STRIP_ADJ: + case TOP_TRI_STRIP_REVERSE: + case TOP_RECT_LIST: + numVerts = 3; + break; + case TOP_QUAD_LIST: + case TOP_QUAD_STRIP: + numVerts = 4; + break; + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + numVerts = topology - TOP_PATCHLIST_BASE; + break; + default: + SWR_ASSERT(false, "Unsupported topology: %d", topology); + break; + } + + if (includeAdjVerts) + { + switch (topology) + { + case TOP_LISTSTRIP_ADJ: + case TOP_LINE_LIST_ADJ: numVerts = 4; break; + case TOP_TRI_STRIP_ADJ: + case TOP_TRI_LIST_ADJ: numVerts = 6; break; + default: break; + } + } + + return numVerts; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate mask from remaining work. +/// @param numWorkItems - Number of items being worked on by a SIMD. 
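+/// @return vector mask with an all-ones lane for each of the first
+///         min(numWorkItems, KNOB_SIMD_WIDTH) items and zeroes elsewhere.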
+static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
+{
+    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
+    return _simd_castps_si(vMask(mask));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief StreamOut - Streams vertex data out to SO buffers.
+///        Generally, we are only streaming out a SIMD's worth of triangles.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
+static void StreamOut(
+    DRAW_CONTEXT* pDC,
+    PA_STATE& pa,
+    uint32_t workerId,
+    uint32_t* pPrimData,
+    uint32_t streamIndex)
+{
+    RDTSC_START(FEStreamout);
+
+    SWR_CONTEXT* pContext = pDC->pContext;
+
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_STREAMOUT_STATE &soState = state.soState;
+
+    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
+
+    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
+    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+
+    SWR_STREAMOUT_CONTEXT soContext = { 0 };
+
+    // Setup buffer state pointers.
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        soContext.pBuffer[i] = &state.soBuffer[i];
+    }
+
+    uint32_t numPrims = pa.NumPrims();
+    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
+    {
+        DWORD slot = 0;
+        uint32_t soMask = soState.streamMasks[streamIndex];
+
+        // Write all entries into primitive data buffer for SOS.
+        while (_BitScanForward(&slot, soMask))
+        {
+            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
+            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+            pa.AssembleSingle(paSlot, primIndex, attrib);
+
+            // Attribute offset is a relative offset from the start of the vertex.
+            // Note that attributes start at slot 1 in the PA buffer; writing prim
+            // data starting at slot 0 would use (slot - 1), but GL works slightly
+            // differently and needs slot 0 as well, so the slot is used unchanged.
+            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
+
+            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
+            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
+            {
+                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+
+                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
+            }
+            soMask &= ~(1 << slot);
+        }
+
+        // Update pPrimData pointer
+        soContext.pPrimData = pPrimData;
+
+        // Call SOS
+        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
+        state.pfnSoFunc[streamIndex](soContext);
+    }
+
+    // Update SO write offset. The driver provides memory for the update.
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        if (state.soBuffer[i].pWriteOffset)
+        {
+            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+
+            // The SOS increments the existing write offset, so the stat is set to
+            // the absolute offset rather than incremented by a relative amount.
+            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
+        }
+    }
+
+    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+
+    RDTSC_STOP(FEStreamout, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of invocations for one SIMD iteration. The
+///        current index is the start of the SIMD; the max index is the last
+///        index over all work items. If less than a full SIMD's worth of
+///        work remains, the remainder is returned.
+/// @param curIndex - The start index for the SIMD.
+/// @param maxIndex - The last index for all work items.
+static INLINE uint32_t GetNumInvocations(
+    uint32_t curIndex,
+    uint32_t maxIndex)
+{
+    uint32_t remainder = (maxIndex - curIndex);
+    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
+///        The geometry shader will loop over each active streamout buffer, assembling
+///        primitives for the downstream stages. When multistream output is enabled,
+///        the generated stream ID buffer from the GS needs to be converted to a cut
+///        buffer for the primitive assembler.
+/// @param stream - stream id to generate the cut buffer for
+/// @param pStreamIdBase - pointer to the stream ID buffer
+/// @param numEmittedVerts - Number of total verts emitted by the GS
+/// @param pCutBuffer - output buffer to write cuts to
+void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+{
+    SWR_ASSERT(stream < MAX_SO_STREAMS);
+
+    // Each emitted vert carries a 2-bit stream ID on input; the output is one cut bit
+    // per vert, set for any vert that does NOT belong to the requested stream. e.g.
+    // for stream == 1, the input byte 0b01000101 (verts 0, 1 and 3 on stream 1, vert 2
+    // on stream 0) produces the output bits 0b0100.
+    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
+    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
+
+    for (uint32_t b = 0; b < numOutputBytes; ++b)
+    {
+        uint8_t curInputByte = pStreamIdBase[2*b];
+        uint8_t outByte = 0;
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            if ((curInputByte & 0x3) != stream)
+            {
+                outByte |= (1 << i);
+            }
+            curInputByte >>= 2;
+        }
+
+        curInputByte = pStreamIdBase[2 * b + 1];
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            if ((curInputByte & 0x3) != stream)
+            {
+                outByte |= (1 << (i + 4));
+            }
+            curInputByte >>= 2;
+        }
+
+        *pCutBuffer++ = outByte;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Implements GS stage.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pa - The primitive assembly object.
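+/// @param pCutBuffer - GS output cut (or stream ID) buffer
+/// @param pStreamCutBuffer - scratch buffer used to convert a stream ID buffer to per-stream cuts
+/// @param pSoPrimData - scratch buffer for staging streamout primitive data
+/// @param primID - SIMD of input primitive IDs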
+/// @param pGsOut - output stream for GS +template < + bool HasStreamOutT, + bool HasRastT> +static void GeometryShaderStage( + DRAW_CONTEXT *pDC, + uint32_t workerId, + PA_STATE& pa, + void* pGsOut, + void* pCutBuffer, + void* pStreamCutBuffer, + uint32_t* pSoPrimData, + simdscalari primID) +{ + RDTSC_START(FEGeometryShader); + + SWR_GS_CONTEXT gsContext; + SWR_CONTEXT* pContext = pDC->pContext; + + const API_STATE& state = GetApiState(pDC); + const SWR_GS_STATE* pState = &state.gsState; + + SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); + SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); + + gsContext.pStream = (uint8_t*)pGsOut; + gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; + gsContext.PrimitiveID = primID; + + uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); + simdvector attrib[MAX_ATTRIBUTES]; + + // assemble all attributes for the input primitive + for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; + pa.Assemble(attribSlot, attrib); + + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + gsContext.vert[i].attrib[attribSlot] = attrib[i]; + } + } + + // assemble position + pa.Assemble(VERTEX_POSITION_SLOT, attrib); + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; + } + + const uint32_t vertexStride = sizeof(simdvertex); + const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; + const uint32_t inputPrimStride = numSimdBatches * vertexStride; + const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; + uint32_t cutPrimStride; + uint32_t cutInstanceStride; + + if (pState->isSingleStream) + { + cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; + cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; + } + else + { + cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); + cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; + } + + // record valid prims from the frontend to avoid over binning the newly generated + // prims from the GS + uint32_t numInputPrims = pa.NumPrims(); + + for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) + { + gsContext.InstanceID = instance; + gsContext.mask = GenerateMask(numInputPrims); + + // execute the geometry shader + state.pfnGsFunc(GetPrivateState(pDC), &gsContext); + + gsContext.pStream += instanceStride; + gsContext.pCutOrStreamIdBuffer += cutInstanceStride; + } + + // set up new binner and state for the GS output topology + PFN_PROCESS_PRIMS pfnClipFunc = nullptr; + if (HasRastT) + { + switch (pState->outputTopology) + { + case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; + case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; + case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; + default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); + } + } + + // foreach input prim: + // - setup a new PA based on the emitted verts for that prim + // - loop over the new verts, calling PA to assemble each prim + uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; + uint32_t* pPrimitiveId = (uint32_t*)&primID; + + uint32_t totalPrimsGenerated = 0; + for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) + { + uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride; + uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride; + for (uint32_t instance = 0; 
instance < pState->instanceCount; ++instance) + { + uint32_t numEmittedVerts = pVertexCount[inputPrim]; + if (numEmittedVerts == 0) + { + continue; + } + + uint8_t* pBase = pInstanceBase + instance * instanceStride; + uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; + + DWORD numAttribs; + _BitScanReverse(&numAttribs, state.feAttribMask); + numAttribs++; + + for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) + { + bool processCutVerts = false; + + uint8_t* pCutBuffer = pCutBase; + + // assign default stream ID, only relevant when GS is outputting a single stream + uint32_t streamID = 0; + if (pState->isSingleStream) + { + processCutVerts = true; + streamID = pState->singleStreamID; + if (streamID != stream) continue; + } + else + { + // early exit if this stream is not enabled for streamout + if (HasStreamOutT && !state.soState.streamEnable[stream]) + { + continue; + } + + // multi-stream output, need to translate StreamID buffer to a cut buffer + ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer); + pCutBuffer = (uint8_t*)pStreamCutBuffer; + processCutVerts = false; + } + + PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); + + while (gsPa.GetNextStreamOutput()) + { + do + { + bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); + + if (assemble) + { + totalPrimsGenerated += gsPa.NumPrims(); + + if (HasStreamOutT) + { + StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); + } + + if (HasRastT && state.soState.streamToRasterizer == stream) + { + simdscalari vPrimId; + // pull primitiveID from the GS output if available + if (state.gsState.emitsPrimitiveID) + { + simdvector primIdAttrib[3]; + gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib); + vPrimId = _simd_castps_si(primIdAttrib[0].x); + } + else + { + vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); + } + + pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); + } + } + } while (gsPa.NextPrim()); + } + } + } + } + + // update GS pipeline stats + UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); + UPDATE_STAT(GsPrimitives, totalPrimsGenerated); + + RDTSC_STOP(FEGeometryShader, 1, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Allocate GS buffers +/// @param pDC - pointer to draw context. 
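+/// @param ppStreamCutBuffer - pointer to the scratch allocation used to convert a
+///        multi-stream stream ID buffer into per-stream cuts (multi-stream only)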
+/// @param state - API state +/// @param ppGsOut - pointer to GS output buffer allocation +/// @param ppCutBuffer - pointer to GS output cut buffer allocation +static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, + void **ppStreamCutBuffer) +{ + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + SWR_ASSERT(state.gsState.gsEnable); + // allocate arena space to hold GS output verts + // @todo pack attribs + // @todo support multiple streams + const uint32_t vertexStride = sizeof(simdvertex); + const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; + uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; + *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); + + const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; + const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); + const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; + const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; + + // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the + // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance + + // allocate space for temporary per-stream cut buffer if multi-stream is enabled + if (state.gsState.isSingleStream) + { + *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); + *ppStreamCutBuffer = nullptr; + } + else + { + *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float)); + *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); + } + +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Contains all data generated by the HS and passed to the +/// tessellator and DS. +struct TessellationThreadLocalData +{ + SWR_HS_CONTEXT hsContext; + ScalarPatch patchData[KNOB_SIMD_WIDTH]; + void* pTxCtx; + size_t tsCtxSize; + + simdscalar* pDSOutput; + size_t numDSOutputVectors; +}; + +THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; + +////////////////////////////////////////////////////////////////////////// +/// @brief Allocate tessellation data for this worker thread. +INLINE +static void AllocateTessellationData(SWR_CONTEXT* pContext) +{ + /// @TODO - Don't use thread local storage. Use Worker local storage instead. + if (gt_pTessellationThreadData == nullptr) + { + gt_pTessellationThreadData = (TessellationThreadLocalData*) + _aligned_malloc(sizeof(TessellationThreadLocalData), 64); + memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Implements Tessellation Stages. +/// @param pDC - pointer to draw context. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param pa - The primitive assembly object. 
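+/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
+/// @tparam HasStreamOutT - Is stream-out enabled
+/// @tparam HasRastT - Is rasterization enabled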
+/// @param pGsOut - output stream for GS +template < + bool HasGeometryShaderT, + bool HasStreamOutT, + bool HasRastT> +static void TessellationStages( + DRAW_CONTEXT *pDC, + uint32_t workerId, + PA_STATE& pa, + void* pGsOut, + void* pCutBuffer, + void* pCutStreamBuffer, + uint32_t* pSoPrimData, + simdscalari primID) +{ + const API_STATE& state = GetApiState(pDC); + const SWR_TS_STATE& tsState = state.tsState; + SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro + + SWR_ASSERT(gt_pTessellationThreadData); + + HANDLE tsCtx = TSInitCtx( + tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); + if (tsCtx == nullptr) + { + gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64); + tsCtx = TSInitCtx( + tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); + } + SWR_ASSERT(tsCtx); + + PFN_PROCESS_PRIMS pfnClipFunc = nullptr; + if (HasRastT) + { + switch (tsState.postDSTopology) + { + case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; + case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; + case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; + default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology); + } + } + + SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; + hsContext.pCPout = gt_pTessellationThreadData->patchData; + hsContext.PrimitiveID = primID; + + uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); + // Max storage for one attribute for an entire simdprimitive + simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; + + // assemble all attributes for the input primitives + for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; + pa.Assemble(attribSlot, simdattrib); + + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + hsContext.vert[i].attrib[attribSlot] = simdattrib[i]; + } + } + +#if defined(_DEBUG) + memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); +#endif + + uint32_t numPrims = pa.NumPrims(); + hsContext.mask = GenerateMask(numPrims); + + // Run the HS + RDTSC_START(FEHullShader); + state.pfnHsFunc(GetPrivateState(pDC), &hsContext); + RDTSC_STOP(FEHullShader, 0, 0); + + UPDATE_STAT(HsInvocations, numPrims); + + const uint32_t* pPrimId = (const uint32_t*)&primID; + + for (uint32_t p = 0; p < numPrims; ++p) + { + // Run Tessellator + SWR_TS_TESSELLATED_DATA tsData = { 0 }; + RDTSC_START(FETessellation); + TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); + RDTSC_STOP(FETessellation, 0, 0); + + if (tsData.NumPrimitives == 0) + { + continue; + } + SWR_ASSERT(tsData.NumDomainPoints); + + // Allocate DS Output memory + uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; + size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; + size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; + if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) + { + _aligned_free(gt_pTessellationThreadData->pDSOutput); + gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64); + gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; + } + SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); + 
SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors); + +#if defined(_DEBUG) + memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); +#endif + + // Run Domain Shader + SWR_DS_CONTEXT dsContext; + dsContext.PrimitiveID = pPrimId[p]; + dsContext.pCpIn = &hsContext.pCPout[p]; + dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; + dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; + dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; + dsContext.vectorStride = requiredDSVectorInvocations; + + uint32_t dsInvocations = 0; + + for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) + { + dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); + + RDTSC_START(FEDomainShader); + state.pfnDsFunc(GetPrivateState(pDC), &dsContext); + RDTSC_STOP(FEDomainShader, 0, 0); + + dsInvocations += KNOB_SIMD_WIDTH; + } + UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); + + PA_TESS tessPa( + pDC, + dsContext.pOutputData, + dsContext.vectorStride, + tsState.numDsOutputAttribs, + tsData.ppIndices, + tsData.NumPrimitives, + tsState.postDSTopology); + + while (tessPa.HasWork()) + { + if (HasGeometryShaderT) + { + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, + _simd_set1_epi32(dsContext.PrimitiveID)); + } + else + { + if (HasStreamOutT) + { + StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); + } + + if (HasRastT) + { + simdvector prim[3]; // Only deal with triangles, lines, or points + RDTSC_START(FEPAAssemble); +#if SWR_ENABLE_ASSERTS + bool assemble = +#endif + tessPa.Assemble(VERTEX_POSITION_SLOT, prim); + RDTSC_STOP(FEPAAssemble, 1, 0); + SWR_ASSERT(assemble); + + SWR_ASSERT(pfnClipFunc); + pfnClipFunc(pDC, tessPa, workerId, prim, + GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); + } + } + + tessPa.NextPrim(); + + } // while (tessPa.HasWork()) + } // for (uint32_t p = 0; p < numPrims; ++p) + + TSDestroyCtx(tsCtx); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief FE handler for SwrDraw. +/// @tparam IsIndexedT - Is indexed drawing enabled +/// @tparam HasTessellationT - Is tessellation enabled +/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled +/// @tparam HasStreamOutT - Is stream-out enabled +/// @tparam HasRastT - Is rasterization enabled +/// @param pContext - pointer to SWR context. +/// @param pDC - pointer to draw context. +/// @param workerId - thread's worker id. 
+/// @param pUserData - Pointer to DRAW_WORK +template < + bool IsIndexedT, + bool HasTessellationT, + bool HasGeometryShaderT, + bool HasStreamOutT, + bool HasRastT> +void ProcessDraw( + SWR_CONTEXT *pContext, + DRAW_CONTEXT *pDC, + uint32_t workerId, + void *pUserData) +{ + +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_QUEUE_FE) + { + return; + } +#endif + + RDTSC_START(FEProcessDraw); + + DRAW_WORK& work = *(DRAW_WORK*)pUserData; + const API_STATE& state = GetApiState(pDC); + __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + SWR_VS_CONTEXT vsContext; + simdvertex vin; + + int indexSize = 0; + uint32_t endVertex = work.numVerts; + + const int32_t* pLastRequestedIndex = nullptr; + if (IsIndexedT) + { + switch (work.type) + { + case R32_UINT: + indexSize = sizeof(uint32_t); + pLastRequestedIndex = &(work.pIB[endVertex]); + break; + case R16_UINT: + indexSize = sizeof(uint16_t); + // nasty address offset to last index + pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); + break; + case R8_UINT: + indexSize = sizeof(uint8_t); + // nasty address offset to last index + pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); + break; + default: + SWR_ASSERT(0); + } + } + else + { + // No cuts, prune partial primitives. + endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); + } + + SWR_FETCH_CONTEXT fetchInfo = { 0 }; + fetchInfo.pStreams = &state.vertexBuffers[0]; + fetchInfo.StartInstance = work.startInstance; + fetchInfo.StartVertex = 0; + + vsContext.pVin = &vin; + + if (IsIndexedT) + { + fetchInfo.BaseVertex = work.baseVertex; + + // if the entire index buffer isn't being consumed, set the last index + // so that fetches < a SIMD wide will be masked off + fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); + if (pLastRequestedIndex < fetchInfo.pLastIndex) + { + fetchInfo.pLastIndex = pLastRequestedIndex; + } + } + else + { + fetchInfo.StartVertex = work.startVertex; + } + +#ifdef KNOB_ENABLE_RDTSC + uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); +#endif + + void* pGsOut = nullptr; + void* pCutBuffer = nullptr; + void* pStreamCutBuffer = nullptr; + if (HasGeometryShaderT) + { + AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer); + } + + if (HasTessellationT) + { + SWR_ASSERT(state.tsState.tsEnable == true); + SWR_ASSERT(state.pfnHsFunc != nullptr); + SWR_ASSERT(state.pfnDsFunc != nullptr); + + AllocateTessellationData(pContext); + } + else + { + SWR_ASSERT(state.tsState.tsEnable == false); + SWR_ASSERT(state.pfnHsFunc == nullptr); + SWR_ASSERT(state.pfnDsFunc == nullptr); + } + + // allocate space for streamout input prim data + uint32_t* pSoPrimData = nullptr; + if (HasStreamOutT) + { + pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); + + // update the + for (uint32_t i = 0; i < 4; ++i) + { + SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset); + } + + } + + // choose primitive assembler + PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts); + PA_STATE& pa = paFactory.GetPA(); + + /// @todo: temporarily move instance loop in the FE to ensure SO ordering + for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) + { + simdscalari vIndex; + uint32_t i = 0; + + if (IsIndexedT) + { + fetchInfo.pIndices = work.pIB; + } + else + { + vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); + fetchInfo.pIndices = (const int32_t*)&vIndex; + } + + 
fetchInfo.CurInstance = instanceNum; + vsContext.InstanceID = instanceNum; + + while (pa.HasWork()) + { + // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. + // So we need to keep this outside of (i < endVertex) check. + simdmask* pvCutIndices = nullptr; + if (IsIndexedT) + { + pvCutIndices = &pa.GetNextVsIndices(); + } + + simdvertex& vout = pa.GetNextVsOutput(); + vsContext.pVout = &vout; + + if (i < endVertex) + { + + // 1. Execute FS/VS for a single SIMD. + RDTSC_START(FEFetchShader); + state.pfnFetchFunc(fetchInfo, vin); + RDTSC_STOP(FEFetchShader, 0, 0); + + // forward fetch generated vertex IDs to the vertex shader + vsContext.VertexID = fetchInfo.VertexID; + + // Setup active mask for vertex shader. + vsContext.mask = GenerateMask(endVertex - i); + + // forward cut mask to the PA + if (IsIndexedT) + { + *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); + } + + UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { + RDTSC_START(FEVertexShader); + state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); + RDTSC_STOP(FEVertexShader, 0, 0); + + UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); + } + } + + // 2. Assemble primitives given the last two SIMD. + do + { + simdvector prim[MAX_NUM_VERTS_PER_PRIM]; + // PaAssemble returns false if there is not enough verts to assemble. + RDTSC_START(FEPAAssemble); + bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); + RDTSC_STOP(FEPAAssemble, 1, 0); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_VS) +#endif + { + if (assemble) + { + UPDATE_STAT(IaPrimitives, pa.NumPrims()); + + if (HasTessellationT) + { + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else if (HasGeometryShaderT) + { + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else + { + // If streamout is enabled then stream vertices out to memory. 
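+                            // (Position is all that was assembled above; StreamOut
+                            // re-assembles each SO-enabled attribute slot itself via
+                            // AssembleSingle.)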
+ if (HasStreamOutT) + { + StreamOut(pDC, pa, workerId, pSoPrimData, 0); + } + + if (HasRastT) + { + SWR_ASSERT(pDC->pState->pfnProcessPrims); + pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, + GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); + } + } + } + } + } + } while (pa.NextPrim()); + + i += KNOB_SIMD_WIDTH; + if (IsIndexedT) + { + fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + } + else + { + vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); + } + } + pa.Reset(); + } + + RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId); +} +// Explicit Instantiation of all combinations +template void ProcessDraw<false, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, false, false>(SWR_CONTEXT *pContext, 
DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + + +////////////////////////////////////////////////////////////////////////// +/// @brief Processes attributes for the backend based on linkage mask and +/// linkage map. Essentially just doing an SOA->AOS conversion and pack. +/// @param pDC - Draw context +/// @param pa - Primitive Assembly state +/// @param linkageMask - Specifies which VS outputs are routed to PS. +/// @param pLinkageMap - maps VS attribute slot to PS slot +/// @param triIndex - Triangle to process attributes for +/// @param pBuffer - Output result +template<uint32_t NumVerts> +INLINE void ProcessAttributes( + DRAW_CONTEXT *pDC, + PA_STATE&pa, + uint32_t linkageMask, + const uint8_t* pLinkageMap, + uint32_t triIndex, + float *pBuffer) +{ + DWORD slot = 0; + uint32_t mapIdx = 0; + LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask; + const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; + + while (_BitScanForward(&slot, linkageMask)) + { + linkageMask &= ~(1 << slot); // done with this bit. + + // compute absolute slot in vertex attrib array + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx]; + + __m128 attrib[3]; // triangle attribs (always 4 wide) + pa.AssembleSingle(inputSlot, triIndex, attrib); + + if (_bittest(&constantInterpMask, mapIdx)) + { + for (uint32_t i = 0; i < NumVerts; ++i) + { + _mm_store_ps(pBuffer, attrib[provokingVertex]); + pBuffer += 4; + } + } + else + { + for (uint32_t i = 0; i < NumVerts; ++i) + { + _mm_store_ps(pBuffer, attrib[i]); + pBuffer += 4; + } + } + + // pad out the attrib buffer to 3 verts to ensure the triangle + // interpolation code in the pixel shader works correctly for the + // 3 topologies - point, line, tri. This effectively zeros out the + // effect of the missing vertices in the triangle interpolation. 
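+        // e.g. for a line (NumVerts == 2) the buffer receives v0, v1, v1: the padded
+        // third slot holds a copy of the last real vertex's attribute.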
+        for (uint32_t i = NumVerts; i < 3; ++i)
+        {
+            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
+            pBuffer += 4;
+        }
+
+        mapIdx++;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Processes enabled user clip distances. Loads the active clip
+///        distances from the PA, sets up barycentric equations, and
+///        stores the results to the output buffer
+/// @param pa - Primitive Assembly state
+/// @param primIndex - primitive index to process
+/// @param clipDistMask - mask of enabled clip distances
+/// @param pUserClipBuffer - buffer to store results
+template<uint32_t NumVerts>
+void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
+{
+    DWORD clipDist;
+    while (_BitScanForward(&clipDist, clipDistMask))
+    {
+        clipDistMask &= ~(1 << clipDist);
+        uint32_t clipSlot = clipDist >> 2;
+        uint32_t clipComp = clipDist & 0x3;
+        uint32_t clipAttribSlot = clipSlot == 0 ?
+            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
+
+        __m128 primClipDist[3];
+        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
+
+        float vertClipDist[NumVerts];
+        for (uint32_t e = 0; e < NumVerts; ++e)
+        {
+            OSALIGNSIMD(float) aVertClipDist[4];
+            _mm_store_ps(aVertClipDist, primClipDist[e]);
+            vertClipDist[e] = aVertClipDist[clipComp];
+        }
+
+        // setup plane equations for barycentric interpolation in the backend
+        float baryCoeff[NumVerts];
+        for (uint32_t e = 0; e < NumVerts - 1; ++e)
+        {
+            baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
+        }
+        baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
+
+        for (uint32_t e = 0; e < NumVerts; ++e)
+        {
+            *(pUserClipBuffer++) = baryCoeff[e];
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping,
+///        culling, viewport transform, etc.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param tri - Contains triangle position data for a SIMD's worth of triangles.
+/// @param triMask - Mask of valid triangle lanes.
+/// @param primID - Primitive ID for each triangle.
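+/// The flow below: perspective divide and viewport transform, conversion to x.8
+/// fixed point, triangle setup with zero-area and cull-mode culling, bounding box
+/// and scissor tests, then per-triangle enqueue to all covered macrotiles.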
+void BinTriangles( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector tri[3], + uint32_t triMask, + simdscalari primID) +{ + RDTSC_START(FEBinTriangles); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + // Simple wireframe mode for debugging purposes only + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + simdscalar vRecipW2 = _simd_set1_ps(1.0f); + + if (!feState.vpTransformDisable) + { + // perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); + vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); + + tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); + tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); + tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); + + tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); + tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); + tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); + + tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); + tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); + tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); + + // viewport transform to screen coords + viewportTransform<3>(tri, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + tri[0].x = _simd_add_ps(tri[0].x, offset); + tri[0].y = _simd_add_ps(tri[0].y, offset); + + tri[1].x = _simd_add_ps(tri[1].x, offset); + tri[1].y = _simd_add_ps(tri[1].y, offset); + + tri[2].x = _simd_add_ps(tri[2].x, offset); + tri[2].y = _simd_add_ps(tri[2].y, offset); + + // convert to fixed point + simdscalari vXi[3], vYi[3]; + vXi[0] = fpToFixedPointVertical(tri[0].x); + vYi[0] = fpToFixedPointVertical(tri[0].y); + vXi[1] = fpToFixedPointVertical(tri[1].x); + vYi[1] = fpToFixedPointVertical(tri[1].y); + vXi[2] = fpToFixedPointVertical(tri[2].x); + vYi[2] = fpToFixedPointVertical(tri[2].y); + + // triangle setup + simdscalari vAi[3], vBi[3]; + triangleSetupABIntVertical(vXi, vYi, vAi, vBi); + + // determinant + simdscalari vDet[2]; + calcDeterminantIntVertical(vAi, vBi, vDet); + + // cull zero area + int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); + int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); + + int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); + + uint32_t origTriMask = triMask; + triMask &= ~cullZeroAreaMask; + + // determine front winding tris + // CW +det + // CCW -det + maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); + maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); + int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); + + uint32_t frontWindingTris; + if (rastState.frontWinding == SWR_FRONTWINDING_CW) + { + frontWindingTris = cwTriMask; + } + else + { + frontWindingTris = ~cwTriMask; + } + + // cull + uint32_t cullTris; + switch ((SWR_CULLMODE)rastState.cullMode) + { + case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; + case SWR_CULLMODE_NONE: cullTris = 0x0; break; + case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; + case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; + default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; + } + + 
triMask &= ~cullTris; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + + // compute per tri backface + uint32_t frontFaceMask = frontWindingTris; + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD triIndex = 0; + + if (!triMask) + { + goto endBinTriangles; + } + + // Calc bounding box of triangles + simdBBox bbox; + calcBoundingBoxIntVertical(vXi, vYi, bbox); + + // determine if triangle falls between pixel centers and discard + // only discard for non-MSAA case + // (left + 127) & ~255 + // (right + 128) & ~255 + + if(rastState.sampleCount == SWR_MULTISAMPLE_1X) + { + origTriMask = triMask; + + int cullCenterMask; + { + simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127)); + left = _simd_and_si(left, _simd_set1_epi32(~255)); + simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128)); + right = _simd_and_si(right, _simd_set1_epi32(~255)); + + simdscalari vMaskH = _simd_cmpeq_epi32(left, right); + + simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127)); + top = _simd_and_si(top, _simd_set1_epi32(~255)); + simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128)); + bottom = _simd_and_si(bottom, _simd_set1_epi32(~255)); + + simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom); + vMaskV = _simd_or_si(vMaskH, vMaskV); + cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); + } + + triMask &= ~cullCenterMask; + + if(origTriMask ^ triMask) + { + RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + } + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. + bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); + bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); + bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); + bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); + + // Cull tris completely outside scissor + { + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + triMask = triMask & ~maskOutsideScissor; + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Convert triangle bbox to macrotile units. 
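+    // The bbox is still in x.8 fixed point; the shift strips the fractional bits and
+    // the macrotile dimension bits in one step (e.g. with a hypothetical 64-pixel
+    // macrotile, KNOB_MACROTILE_X_DIM_FIXED_SHIFT would be 6 + FIXED_POINT_SHIFT).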
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.left); + _simd_store_si((simdscalari*)aMTRight, bbox.right); + _simd_store_si((simdscalari*)aMTTop, bbox.top); + _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); + vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); + vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); + vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[3]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii; + vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&triIndex, triMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1);
+        desc.triFlags.primID = pPrimID[triIndex];
+        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
+
+        if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
+        {
+            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
+        }
+        else
+        {
+            // for center sample pattern, all samples are at pixel center; calculate coverage
+            // once at center and broadcast the results in the backend
+            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
+        }
+
+        Arena* pArena = pDC->pArena;
+        SWR_ASSERT(pArena != nullptr);
+
+        // store active attribs
+        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+        desc.pAttribs = pAttribs;
+        desc.numAttribs = linkageCount;
+        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
+
+        // store triangle vertex data
+        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
+
+        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+
+        // store user clip distances
+        if (rastState.clipDistanceMask)
+        {
+            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
+            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
+        }
+
+        MacroTileMgr *pTileMgr = pDC->pTileMgr;
+        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
+        {
+            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
+            {
+#if KNOB_ENABLE_TOSS_POINTS
+                if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+                {
+                    pTileMgr->enqueue(x, y, &work);
+                }
+            }
+        }
+
+        triMask &= ~(1 << triIndex);
+    }
+
+endBinTriangles:
+    RDTSC_STOP(FEBinTriangles, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains point position data for a SIMD's worth of points.
+/// @param primMask - Mask of valid point lanes.
+/// @param primID - Primitive ID for each point.
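+/// Size-1 points (see CanUseSimplePoints) pack their raster-tile-relative x,y into
+/// the coverage mask and bin to a single macrotile via RasterizeSimplePoint; larger
+/// points bloat a bounding box by half the point size and go through RasterizeTriPoint.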
+void BinPoints( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari primID) +{ + RDTSC_START(FEBinPoints); + + simdvector& primVerts = prim[0]; + + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + const SWR_RASTSTATE& rastState = state.rastState; + + if (!feState.vpTransformDisable) + { + // perspective divide + simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); + primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); + primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); + primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); + + // viewport transform to screen coords + viewportTransform<1>(&primVerts, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + primVerts.x = _simd_add_ps(primVerts.x, offset); + primVerts.y = _simd_add_ps(primVerts.y, offset); + + // convert to fixed point + simdscalari vXi, vYi; + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); + + if (CanUseSimplePoints(pDC)) + { + // adjust for top-left rule + vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); + vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); + + // cull points off the top-left edge of the viewport + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); + + // compute macro tile coordinates + simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMacroX, macroX); + _simd_store_si((simdscalari*)aMacroY, macroY); + + // compute raster tile coordinates + simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + + // compute raster tile relative x,y for coverage mask + simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); + simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + + simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); + simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + + OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); + _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); + + OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); + _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); + + OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aZ, primVerts.z); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai; + pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai.x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD primIndex = 0; + // scan 
remaining valid triangles and bin each separately + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + // points are always front facing + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + + work.pfnWork = RasterizeSimplePoint; + + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store attributes + float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + + ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs); + + // store raster tile aligned x, y, perspective correct z + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; + *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; + *pTriBuffer = aZ[primIndex]; + + uint32_t tX = aTileRelativeX[primIndex]; + uint32_t tY = aTileRelativeY[primIndex]; + + // pack the relative x,y into the coverageMask, the rasterizer will + // generate the true coverage mask from it + work.desc.tri.triFlags.coverageMask = tX | (tY << 4); + + // bin it + MacroTileMgr *pTileMgr = pDC->pTileMgr; +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); + } + primMask &= ~(1 << primIndex); + } + } + else + { + // non simple points need to be potentially binned to multiple macro tiles + simdscalar vPointSize; + if (rastState.pointParam) + { + simdvector size[3]; + pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); + vPointSize = size[0].x; + } + else + { + vPointSize = _simd_set1_ps(rastState.pointSize); + } + + // bloat point to bbox + simdBBox bbox; + bbox.left = bbox.right = vXi; + bbox.top = bbox.bottom = vYi; + + simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); + bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi); + bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); + bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. + bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); + bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); + bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); + bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); + + // Cull bloated points completely outside scissor + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + + // Convert bbox to macrotile units. 
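+        // Unlike the simple-point path above, a bloated point's bbox can span several
+        // macrotiles; the prim is enqueued to every macrotile its bbox touches below.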
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.left); + _simd_store_si((simdscalari*)aMTRight, bbox.right); + _simd_store_si((simdscalari*)aMTTop, bbox.top); + _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[2]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aPointSize, vPointSize); + + uint32_t *pPrimID = (uint32_t *)&primID; + + OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; + + _simd_store_ps((float*)aPrimVertsX, primVerts.x); + _simd_store_ps((float*)aPrimVertsY, primVerts.y); + _simd_store_ps((float*)aPrimVertsZ, primVerts.z); + + // scan remaining valid prims and bin each separately + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.pointSize = aPointSize[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + + work.pfnWork = RasterizeTriPoint; + + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs); + + // store point vertex data + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *pTriBuffer++ = aPrimVertsX[primIndex]; + *pTriBuffer++ = aPrimVertsY[primIndex]; + *pTriBuffer = aPrimVertsZ[primIndex]; + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + } + + + + + RDTSC_STOP(FEBinPoints, 1, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD lines to the backend. +/// @param pDC - pointer to draw context. 
+/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains line position data for SIMDs worth of points. +/// @param primID - Primitive ID for each line. +void BinLines( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[], + uint32_t primMask, + simdscalari primID) +{ + RDTSC_START(FEBinLines); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + + if (!feState.vpTransformDisable) + { + // perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); + + prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); + prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); + + prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); + prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); + + prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); + prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); + + // viewport transform to screen coords + viewportTransform<2>(prim, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + prim[0].x = _simd_add_ps(prim[0].x, offset); + prim[0].y = _simd_add_ps(prim[0].y, offset); + + prim[1].x = _simd_add_ps(prim[1].x, offset); + prim[1].y = _simd_add_ps(prim[1].y, offset); + + // convert to fixed point + simdscalari vXi[2], vYi[2]; + vXi[0] = fpToFixedPointVertical(prim[0].x); + vYi[0] = fpToFixedPointVertical(prim[0].y); + vXi[1] = fpToFixedPointVertical(prim[1].x); + vYi[1] = fpToFixedPointVertical(prim[1].y); + + // compute x-major vs y-major mask + simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); + simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); + simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); + + // cull zero-length lines + simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); + vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); + + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); + + uint32_t *pPrimID = (uint32_t *)&primID; + + simdscalar vUnused = _simd_setzero_ps(); + + // Calc bounding box of lines + simdBBox bbox; + bbox.left = _simd_min_epi32(vXi[0], vXi[1]); + bbox.right = _simd_max_epi32(vXi[0], vXi[1]); + bbox.top = _simd_min_epi32(vYi[0], vYi[1]); + bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]); + + // bloat bbox by line width along minor axis + simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + simdBBox bloatBox; + bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); + bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi); + bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); + bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); + + bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); + bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask); + bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); + bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask); + + // Intersect 
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+
+ // Cull prims completely outside scissor
+ {
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ primMask = primMask & ~maskOutsideScissor;
+ }
+
+ if (!primMask)
+ {
+ goto endBinLines;
+ }
+
+ // Convert line bbox to macrotile units.
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aMTLeft, bbox.left);
+ _simd_store_si((simdscalari*)aMTRight, bbox.right);
+ _simd_store_si((simdscalari*)aMTTop, bbox.top);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+
+ // transpose verts needed for backend
+ /// @todo modify BE to take non-transformed verts
+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
+ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
+ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai[2];
+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ }
+
+ // scan remaining valid prims and bin each separately
+ DWORD primIndex;
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+ uint32_t numScalarAttribs = linkageCount * 4;
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri;
+
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+
+ work.pfnWork = RasterizeLine;
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+ desc.numAttribs = linkageCount;
+ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
+
+ // store line vertex data
+ desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
+ _mm_store_ps(&desc.pTriBuffer[0], 
vHorizX[primIndex]); + _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + +endBinLines: + + RDTSC_STOP(FEBinLines, 1, 0); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h new file mode 100644 index 00000000000..acb935fc251 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -0,0 +1,327 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file frontend.h +* +* @brief Definitions for Frontend which handles vertex processing, +* primitive assembly, clipping, binning, etc. 
+* +******************************************************************************/ +#pragma once +#include "context.h" + +INLINE +__m128i fpToFixedPoint(const __m128 vIn) +{ + __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); + return _mm_cvtps_epi32(vFixed); +} + +INLINE +simdscalari fpToFixedPointVertical(const simdscalar vIn) +{ + simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE)); + return _simd_cvtps_epi32(vFixed); +} + + +// Calculates the A and B coefficients for the 3 edges of the triangle +// +// maths for edge equations: +// standard form of a line in 2d +// Ax + By + C = 0 +// A = y0 - y1 +// B = x1 - x0 +// C = x0y1 - x1y0 +INLINE +void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) +{ + // vYsub = y1 y2 y0 dc + __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); + // vY = y0 y1 y2 dc + vA = _mm_sub_ps(vY, vYsub); + + // Result: + // A[0] = y0 - y1 + // A[1] = y1 - y2 + // A[2] = y2 - y0 + + // vXsub = x1 x2 x0 dc + __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); + // vX = x0 x1 x2 dc + vB = _mm_sub_ps(vXsub, vX); + + // Result: + // B[0] = x1 - x0 + // B[1] = x2 - x1 + // B[2] = x0 - x2 +} + +INLINE +void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) +{ + // generate edge equations + // A = y0 - y1 + // B = x1 - x0 + vA[0] = _simd_sub_ps(vY[0], vY[1]); + vA[1] = _simd_sub_ps(vY[1], vY[2]); + vA[2] = _simd_sub_ps(vY[2], vY[0]); + + vB[0] = _simd_sub_ps(vX[1], vX[0]); + vB[1] = _simd_sub_ps(vX[2], vX[1]); + vB[2] = _simd_sub_ps(vX[0], vX[2]); +} + +INLINE +void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) +{ + // generate edge equations + // A = y0 - y1 + // B = x1 - x0 + // C = x0y1 - x1y0 + __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); + vA = _mm_sub_epi32(vY, vYsub); + + __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); + vB = _mm_sub_epi32(vXsub, vX); +} + +INLINE +void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) +{ + // A = y0 - y1 + // B = x1 - x0 + vA[0] = _simd_sub_epi32(vY[0], vY[1]); + vA[1] = _simd_sub_epi32(vY[1], vY[2]); + vA[2] = _simd_sub_epi32(vY[2], vY[0]); + + vB[0] = _simd_sub_epi32(vX[1], vX[0]); + vB[1] = _simd_sub_epi32(vX[2], vX[1]); + vB[2] = _simd_sub_epi32(vX[0], vX[2]); +} +// Calculate the determinant of the triangle +// 2 vectors between the 3 points: P, Q +// Px = x0-x2, Py = y0-y2 +// Qx = x1-x2, Qy = y1-y2 +// |Px Qx| +// det = | | = PxQy - PyQx +// |Py Qy| +// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) +// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx +// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) +// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) +// : B[2]*A[1] - A[2]*B[1] +INLINE +float calcDeterminantInt(const __m128i vA, const __m128i vB) +{ + // vAShuf = [A1, A0, A2, A0] + __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); + // vBShuf = [B2, B0, B1, B0] + __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); + // vMul = [A1*B2, B1*A2] + __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); + + // shuffle upper to lower + // vMul2 = [B1*A2, B1*A2] + __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); + //vMul = [A1*B2 - B1*A2] + vMul = _mm_sub_epi64(vMul, vMul2); + + // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned + OSALIGN(int64_t, 16) result; + _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul)); + + double fResult = (double)result; + fResult = fResult * (1.0 / FIXED_POINT16_SCALE); + + return (float)fResult; +} + +INLINE +void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) +{ + // refer to calcDeterminantInt comment for calculation explanation + // A1*B2 + simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 + simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 + + simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); + simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); + + simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 + simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 + + // B1*A2 + simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); + simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); + + simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); + simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); + + simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); + simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); + + // A1*B2 - A2*B1 + simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); + simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); + + // shuffle 0 1 4 5 -> 0 1 2 3 + simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20); + simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31); + + pvDet[0] = vResultLo; + pvDet[1] = vResultHi; +} + +INLINE +void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) +{ + // C = -Ax - By + vC = _mm_mul_ps(vA, vX); + __m128 vCy = _mm_mul_ps(vB, vY); + vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); + vC = _mm_sub_ps(vC, vCy); +} + +INLINE +void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix) +{ + vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00)); + vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30)); + + vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11)); + vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31)); + + vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22)); + vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32)); +} + +template<uint32_t NumVerts> +INLINE +void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix) +{ + simdscalar m00 = _simd_load1_ps(&vpMatrix.m00); + simdscalar m30 = _simd_load1_ps(&vpMatrix.m30); + simdscalar m11 = _simd_load1_ps(&vpMatrix.m11); + simdscalar m31 = _simd_load1_ps(&vpMatrix.m31); + simdscalar m22 = _simd_load1_ps(&vpMatrix.m22); + simdscalar m32 = _simd_load1_ps(&vpMatrix.m32); + + for (uint32_t i = 0; i < NumVerts; ++i) + { + v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); + v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); + v[i].z = 
_simd_fmadd_ps(v[i].z, m22, m32); + } +} + +INLINE +void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) +{ + // Need horizontal fp min here + __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); + __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); + + __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); + __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); + + + __m128i vMinX = _mm_min_epi32(vX, vX1); + vMinX = _mm_min_epi32(vMinX, vX2); + + __m128i vMaxX = _mm_max_epi32(vX, vX1); + vMaxX = _mm_max_epi32(vMaxX, vX2); + + __m128i vMinY = _mm_min_epi32(vY, vY1); + vMinY = _mm_min_epi32(vMinY, vY2); + + __m128i vMaxY = _mm_max_epi32(vY, vY1); + vMaxY = _mm_max_epi32(vMaxY, vY2); + + bbox.left = _mm_extract_epi32(vMinX, 0); + bbox.right = _mm_extract_epi32(vMaxX, 0); + bbox.top = _mm_extract_epi32(vMinY, 0); + bbox.bottom = _mm_extract_epi32(vMaxY, 0); + +#if 0 + Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) +B = _mm_shuffle_ps(Z, W, 0 0 0 0) +A = _mm_shuffle_epi32(A, 3 0 3 0) +A = _mm_shuffle_ps(A, B, 1 0 1 0) +#endif + +} + +INLINE +void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox) +{ + simdscalari vMinX = vX[0]; + vMinX = _simd_min_epi32(vMinX, vX[1]); + vMinX = _simd_min_epi32(vMinX, vX[2]); + + simdscalari vMaxX = vX[0]; + vMaxX = _simd_max_epi32(vMaxX, vX[1]); + vMaxX = _simd_max_epi32(vMaxX, vX[2]); + + simdscalari vMinY = vY[0]; + vMinY = _simd_min_epi32(vMinY, vY[1]); + vMinY = _simd_min_epi32(vMinY, vY[2]); + + simdscalari vMaxY = vY[0]; + vMaxY = _simd_max_epi32(vMaxY, vY[1]); + vMaxY = _simd_max_epi32(vMaxY, vY[2]); + + bbox.left = vMinX; + bbox.right = vMaxX; + bbox.top = vMinY; + bbox.bottom = vMaxY; +} + +INLINE +bool CanUseSimplePoints(DRAW_CONTEXT *pDC) +{ + const API_STATE& state = GetApiState(pDC); + + return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && + state.rastState.pointSize == 1.0f && + !state.rastState.pointParam && + !state.rastState.pointSpriteEnable); +} + +uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); +uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); + +// Templated Draw front-end function. 
All combinations of template parameter values are available +template <bool IsIndexedT, bool HasTessellationT, bool HasGeometryShaderT, bool HasStreamOutT, bool HasRastT> +void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + +void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + +struct PA_STATE_BASE; // forward decl +void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID); +void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); +void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); + diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h new file mode 100644 index 00000000000..d7feb86273d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -0,0 +1,142 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file knobs.h +* +* @brief Static (Compile-Time) Knobs for Core. 
+* +******************************************************************************/ +#pragma once + +#include <stdint.h> +#include <gen_knobs.h> + +#define KNOB_ARCH_AVX 0 +#define KNOB_ARCH_AVX2 1 +#define KNOB_ARCH_AVX512 2 + +/////////////////////////////////////////////////////////////////////////////// +// Architecture validation +/////////////////////////////////////////////////////////////////////////////// +#if !defined(KNOB_ARCH) +#define KNOB_ARCH KNOB_ARCH_AVX +#endif + +#if (KNOB_ARCH == KNOB_ARCH_AVX) +#define KNOB_ARCH_ISA AVX +#define KNOB_ARCH_STR "AVX" +#define KNOB_SIMD_WIDTH 8 +#elif (KNOB_ARCH == KNOB_ARCH_AVX2) +#define KNOB_ARCH_ISA AVX2 +#define KNOB_ARCH_STR "AVX2" +#define KNOB_SIMD_WIDTH 8 +#elif (KNOB_ARCH == KNOB_ARCH_AVX512) +#define KNOB_ARCH_ISA AVX512F +#define KNOB_ARCH_STR "AVX512" +#define KNOB_SIMD_WIDTH 16 +#error "AVX512 not yet supported" +#else +#error "Unknown architecture" +#endif + +#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") + +/////////////////////////////////////////////////////////////////////////////// +// Configuration knobs +/////////////////////////////////////////////////////////////////////////////// +#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. + +// Maximum supported number of active vertex buffer streams +#define KNOB_NUM_STREAMS 32 + +// Maximum supported number of attributes per vertex +#define KNOB_NUM_ATTRIBUTES 38 + +// Maximum supported active viewports and scissors +#define KNOB_NUM_VIEWPORTS_SCISSORS 16 + +// Guardband range used by the clipper +#define KNOB_GUARDBAND_WIDTH 32768.0f +#define KNOB_GUARDBAND_HEIGHT 32768.0f + +/////////////////////////////// +// Macro tile configuration +/////////////////////////////// + +// raster tile dimensions +#define KNOB_TILE_X_DIM 8 +#define KNOB_TILE_X_DIM_SHIFT 3 +#define KNOB_TILE_Y_DIM 8 +#define KNOB_TILE_Y_DIM_SHIFT 3 + +// fixed macrotile pixel dimension for now, eventually will be +// dynamically set based on tile format and pixel size +#define KNOB_MACROTILE_X_DIM 64 +#define KNOB_MACROTILE_Y_DIM 64 +#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) +#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) +#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14 +#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14 +#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) +#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) + +// total # of hot tiles available. 
This should be enough to +// fully render a 16kx16k 128bpp render target +#define KNOB_NUM_HOT_TILES_X 256 +#define KNOB_NUM_HOT_TILES_Y 256 +#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT +#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT +#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT + +// Max scissor rectangle +#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM +#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM + +#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 +#error "incompatible width/tile dimensions" +#endif + +#if KNOB_SIMD_WIDTH == 8 +#define SIMD_TILE_X_DIM 4 +#define SIMD_TILE_Y_DIM 2 +#else +#error "Invalid simd width" +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Optimization knobs +/////////////////////////////////////////////////////////////////////////////// +#define KNOB_USE_FAST_SRGB TRUE + +// enables cut-aware primitive assembler +#define KNOB_ENABLE_CUT_AWARE_PA TRUE + +/////////////////////////////////////////////////////////////////////////////// +// Debug knobs +/////////////////////////////////////////////////////////////////////////////// +//#define KNOB_ENABLE_RDTSC + +// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. +#if !defined(KNOB_ENABLE_TOSS_POINTS) +#define KNOB_ENABLE_TOSS_POINTS 0 +#endif + diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h new file mode 100644 index 00000000000..3f19555557f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -0,0 +1,98 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file knobs_init.h +* +* @brief Dynamic Knobs Initialization for Core. 
+* +******************************************************************************/ +#pragma once + +#include <core/knobs.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <stdio.h> + +// Assume the type is compatible with a 32-bit integer +template <typename T> +static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue) +{ + uint32_t value = 0; + if (sscanf(pOverride, "%u", &value)) + { + knobValue = static_cast<T>(value); + } +} + +static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue) +{ + size_t len = strlen(pOverride); + if (len == 1) + { + auto c = tolower(pOverride[0]); + if (c == 'y' || c == 't' || c == '1') + { + knobValue = true; + return; + } + if (c == 'n' || c == 'f' || c == '0') + { + knobValue = false; + return; + } + } + + // Try converting to a number and casting to bool + uint32_t value = 0; + if (sscanf(pOverride, "%u", &value)) + { + knobValue = value != 0; + return; + } +} + +static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) +{ + float value = knobValue; + if (sscanf(pOverride, "%f", &value)) + { + knobValue = value; + } +} + +template <typename T> +static inline void InitKnob(T& knob) +{ + + // TODO, read registry first + + // Second, read environment variables + const char* pOverride = getenv(knob.Name()); + + if (pOverride) + { + auto knobValue = knob.Value(); + ConvertEnvToKnob(pOverride, knobValue); + knob.Value(knobValue); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp new file mode 100644 index 00000000000..d51a546b063 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp @@ -0,0 +1,51 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file multisample.cpp +* +******************************************************************************/ + +#include "multisample.h" + +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2] {0xC0, 0x40}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2] {0xC0, 0x40}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4] {0x60, 0xE0, 0x20, 0xA0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4] {0x20, 0x60, 0xA0, 0xE0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8] {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8] {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16] +{0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16] +{0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00}; + +const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX{0.5f}; +const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY{0.5f}; +const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2]{0.75f, 0.25f}; +const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2]{0.75f, 0.25f}; +const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4]{0.375f, 0.875, 0.125, 0.625}; +const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4]{0.125, 0.375, 0.625, 0.875}; +const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8]{0.5625, 0.4375, 0.8125, 0.3125, 0.1875, 0.0625, 0.6875, 0.9375}; +const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8]{0.3125, 0.6875, 0.5625, 0.1875, 0.8125, 0.4375, 0.9375, 0.0625}; +const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16] +{0.5625, 0.4375, 0.3125, 0.7500, 0.1875, 0.6250, 0.8125, 0.6875, 0.3750, 0.5000, 0.2500, 0.1250, 0.0000, 0.9375, 0.8750, 0.0625}; +const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16] +{0.5625, 0.3125, 0.6250, 0.4375, 0.3750, 0.8125, 0.6875, 0.1875, 0.8750, 0.0625, 0.1250, 0.7500, 0.5000, 0.2500, 0.9375, 0.0000}; diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h new file mode 100644 index 00000000000..4ae777e2fc5 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -0,0 +1,620 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file multisample.h +* +******************************************************************************/ + +#pragma once + +#include "context.h" +#include "format_traits.h" + +INLINE +uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) +{ + static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16}; + assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX); + return sampleCountLUT[sampleCount]; +} + +INLINE +SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) +{ + switch(numSamples) + { + case 1: return SWR_MULTISAMPLE_1X; + case 2: return SWR_MULTISAMPLE_2X; + case 4: return SWR_MULTISAMPLE_4X; + case 8: return SWR_MULTISAMPLE_8X; + case 16: return SWR_MULTISAMPLE_16X; + default: assert(0); return SWR_MULTISAMPLE_1X; + } +} + +// hardcoded offsets based on Direct3d standard multisample positions +// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner +// coords are 0.8 fixed point offsets from (0, 0) +template<SWR_MULTISAMPLE_COUNT sampleCount> +struct MultisampleTraits +{ + INLINE static __m128i vXi(uint32_t sampleNum) = delete; + INLINE static __m128i vYi(uint32_t sampleNum) = delete; + INLINE static simdscalar vX(uint32_t sampleNum) = delete; + INLINE static simdscalar vY(uint32_t sampleNum) = delete; + INLINE static float X(uint32_t sampleNum) = delete; + INLINE static float Y(uint32_t sampleNum) = delete; + INLINE static __m128i TileSampleOffsetsX() = delete; + INLINE static __m128i TileSampleOffsetsY() = delete; + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; + INLINE static simdscalari FullSampleMask() = delete; + + static const uint32_t numSamples = 0; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_1X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + static const __m128i X = _mm_set1_epi32(samplePosXi); + return X; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y = _mm_set1_epi32(samplePosYi); + return Y; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X = _simd_set1_ps(0.5f); + return X; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y = _simd_set1_ps(0.5f); + return Y; + } + + INLINE static float X(uint32_t sampleNum) {return samplePosX;}; + INLINE static float Y(uint32_t sampleNum) {return samplePosY;}; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x80; + static const uint32_t bboxRightEdge = 0x80; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x80; + static const uint32_t bboxBottomEdge = 0x80; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static uint32_t 
RasterTileDepthOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; + + static const uint32_t samplePosXi {0x80}; + static const uint32_t samplePosYi {0x80}; + static const float samplePosX; + static const float samplePosY; + static const uint32_t numSamples = 1; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_2X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + static const __m128i X[numSamples] {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1])}; + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + static const __m128i Y[numSamples] {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1])}; + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x40; + static const uint32_t bboxRightEdge = 0xC0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x40; + static const uint32_t bboxBottomEdge = 0xC0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask =_simd_set1_epi32(0x3); + return mask; + } + + static const uint32_t samplePosXi[2]; + static const uint32_t samplePosYi[2]; + static const float samplePosX[2]; + static const float samplePosY[2]; + static const uint32_t numSamples = 2; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_4X> +{ + INLINE static 
__m128i vXi(uint32_t sampleNum) + { + static const __m128i X[numSamples] + {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3])}; + SWR_ASSERT(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y[numSamples] + {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3])}; + SWR_ASSERT(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x20; + static const uint32_t bboxRightEdge = 0xE0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x20; + static const uint32_t bboxBottomEdge = 0xE0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask = _simd_set1_epi32(0xF); + 
return mask; + } + + static const uint32_t samplePosXi[4]; + static const uint32_t samplePosYi[4]; + static const float samplePosX[4]; + static const float samplePosY[4]; + static const uint32_t numSamples = 4; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_8X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + static const __m128i X[numSamples] + {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]), + _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7])}; + SWR_ASSERT(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y[numSamples] + {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]), + _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7])}; + SWR_ASSERT(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), + _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), + _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x10; + static const uint32_t bboxRightEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x10; + static const uint32_t bboxBottomEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + 
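+ // Each per-sample plane of a hot tile occupies KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
+ // FormatTraits<format>::bpp / 8 bytes, so the offset tables in these accessors
+ // simply encode sampleNum * planeSize. Equivalent computed form (sketch only;
+ // "planeSize" is an illustrative name, not a symbol defined here):
+ //   return sampleNum * planeSize;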
+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileDepthOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileDepthOffsets[sampleNum];
+ }
+
+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileStencilOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileStencilOffsets[sampleNum];
+ }
+
+ INLINE static simdscalari FullSampleMask()
+ {
+ static const simdscalari mask = _simd_set1_epi32(0xFF);
+ return mask;
+ }
+
+ static const uint32_t samplePosXi[8];
+ static const uint32_t samplePosYi[8];
+ static const float samplePosX[8];
+ static const float samplePosY[8];
+ static const uint32_t numSamples = 8;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_16X>
+{
+ INLINE static __m128i vXi(uint32_t sampleNum)
+ {
+ static const __m128i X[numSamples]
+ {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]),
+ _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7]),
+ _mm_set1_epi32(samplePosXi[8]), _mm_set1_epi32(samplePosXi[9]), _mm_set1_epi32(samplePosXi[10]), _mm_set1_epi32(samplePosXi[11]),
+ _mm_set1_epi32(samplePosXi[12]), _mm_set1_epi32(samplePosXi[13]), _mm_set1_epi32(samplePosXi[14]), _mm_set1_epi32(samplePosXi[15])};
+ SWR_ASSERT(sampleNum < numSamples);
+ return X[sampleNum];
+ }
+
+ INLINE static __m128i vYi(uint32_t sampleNum)
+ {
+ static const __m128i Y[numSamples]
+ {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]),
+ _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7]),
+ _mm_set1_epi32(samplePosYi[8]), _mm_set1_epi32(samplePosYi[9]), _mm_set1_epi32(samplePosYi[10]), _mm_set1_epi32(samplePosYi[11]),
+ _mm_set1_epi32(samplePosYi[12]), _mm_set1_epi32(samplePosYi[13]), _mm_set1_epi32(samplePosYi[14]), _mm_set1_epi32(samplePosYi[15])};
+ SWR_ASSERT(sampleNum < numSamples);
+ return Y[sampleNum];
+ }
+
+ 
INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), + _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), + _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), + _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), + _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), + _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), + _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x00; + static const uint32_t bboxRightEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x00; + static const uint32_t bboxBottomEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return 
RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask = _simd_set1_epi32(0xFFFF); + return mask; + } + + static const uint32_t samplePosXi[16]; 
+ static const uint32_t samplePosYi[16];
+ static const float samplePosX[16];
+ static const float samplePosY[16];
+ static const uint32_t numSamples = 16;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
new file mode 100644
index 00000000000..2028d9fbcfe
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -0,0 +1,1208 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file pa.h
+*
+* @brief Definitions for primitive assembly.
+* N primitives are assembled at a time, where N is the SIMD width.
+* A state machine, that is specific for a given topology, drives the
+* assembly of vertices into triangles.
+*
+******************************************************************************/
+#pragma once
+
+#include "frontend.h"
+
+struct PA_STATE
+{
+ DRAW_CONTEXT *pDC; // draw context
+ uint8_t* pStreamBase; // vertex stream
+ uint32_t streamSizeInVerts; // total size of the input stream in verts
+
+ // The topology the binner will use. In some cases the FE changes the topology from the api state.
+ PRIMITIVE_TOPOLOGY binTopology;
+
+ PA_STATE() {}
+ PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
+ pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
+
+ virtual bool HasWork() = 0;
+ virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
+ virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+ virtual bool NextPrim() = 0;
+ virtual simdvertex& GetNextVsOutput() = 0;
+ virtual bool GetNextStreamOutput() = 0;
+ virtual simdmask& GetNextVsIndices() = 0;
+ virtual uint32_t NumPrims() = 0;
+ virtual void Reset() = 0;
+ virtual simdscalari GetPrimID(uint32_t startID) = 0;
+};
+
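+// A frontend loop drives this interface roughly as follows (illustrative
+// sketch only; the mask helper and the position slot used here are
+// assumptions, not definitions from this file):
+//
+//     while (pa.HasWork())
+//     {
+//         simdvector prims[3];
+//         if (pa.Assemble(VERTEX_POSITION_SLOT, prims))
+//         {
+//             uint32_t primMask = GenMask(pa.NumPrims());
+//             BinTriangles(pDC, pa, workerId, prims, primMask, pa.GetPrimID(startPrimID));
+//         }
+//         pa.NextPrim();
+//     }
+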
Often the PA function needs 2 simd vertices in order to assemble the next triangle.
+// 1. We call this the current and previous simd vertex.
+// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
+// order to assemble the second triangle, for a triangle list, we'll need the
+// last vertex from the previous simd and the first 2 vertices from the current simd.
+// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
+//
+// This optimized PA is not cut aware, so it should only be used for non-indexed draws or
+// draws without cuts.
+struct PA_STATE_OPT : public PA_STATE
+{
+ simdvertex leadingVertex; // For tri-fan
+ uint32_t numPrims; // Total number of primitives for draw.
+ uint32_t numPrimsComplete; // Total number of complete primitives.
+
+ uint32_t numSimdPrims; // Number of prims in current simd.
+
+ uint32_t cur; // index to current VS output.
+ uint32_t prev; // index to prev VS output. Not really needed in the state.
+ uint32_t first; // index to first VS output. Used for trifan.
+
+ uint32_t counter; // state counter
+ bool reset; // reset state
+
+ uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2})
+ simdscalari primID;
+
+ typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
+ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+ PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles.
+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle.
+ PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset
+
+ // state used to advance the PA when Next is called
+ PFN_PA_FUNC pfnPaNextFunc;
+ uint32_t nextNumSimdPrims;
+ uint32_t nextNumPrimsIncrement;
+ bool nextReset;
+ bool isStreaming;
+
+ simdmask tmpIndices; // temporary index store for unused virtual function
+
+ PA_STATE_OPT() {}
+ PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+
+ bool HasWork()
+ {
+ return (this->numPrimsComplete < this->numPrims) ? true : false;
+ }
+
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+ {
+ simdvertex* pVertex = (simdvertex*)pStreamBase;
+ return pVertex[index].attrib[slot];
+ }
+
+ // Assembles 4 triangles. Each simdvector is a single vertex from 4
+ // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
+ bool Assemble(uint32_t slot, simdvector verts[])
+ {
+ return this->pfnPaFunc(*this, slot, verts);
+ }
+
+ // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ {
+ return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
+ }
+
+ bool NextPrim()
+ {
+ this->pfnPaFunc = this->pfnPaNextFunc;
+ this->numSimdPrims = this->nextNumSimdPrims;
+ this->numPrimsComplete += this->nextNumPrimsIncrement;
+ this->reset = this->nextReset;
+
+ if (this->isStreaming)
+ {
+ this->reset = false;
+ }
+
+ bool morePrims = false;
+
+ if (this->numSimdPrims > 0)
+ {
+ morePrims = true;
+ this->numSimdPrims--;
+ }
+ else
+ {
+ this->counter = (this->reset) ?
0 : (this->counter + 1); + this->reset = false; + } + + this->pfnPaFunc = this->pfnPaNextFunc; + + if (!HasWork()) + { + morePrims = false; // no more to do + } + + return morePrims; + } + + simdvertex& GetNextVsOutput() + { + // increment cur and prev indices + const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; + this->prev = this->cur; // prev is undefined for first state. + this->cur = this->counter % numSimdVerts; + + simdvertex* pVertex = (simdvertex*)pStreamBase; + return pVertex[this->cur]; + } + + simdmask& GetNextVsIndices() + { + // unused in optimized PA, pass tmp buffer back + return tmpIndices; + } + + bool GetNextStreamOutput() + { + this->prev = this->cur; + this->cur = this->counter; + + return HasWork(); + } + + uint32_t NumPrims() + { + return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? + (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; + } + + void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) + { + this->pfnPaNextFunc = pfnPaNextFunc; + this->nextNumSimdPrims = numSimdPrims; + this->nextNumPrimsIncrement = numPrimsIncrement; + this->nextReset = reset; + + this->pfnPaSingleFunc = pfnPaNextSingleFunc; + } + + void Reset() + { + this->pfnPaFunc = this->pfnPaFuncReset; + this->numPrimsComplete = 0; + this->numSimdPrims = 0; + this->cur = 0; + this->prev = 0; + this->first = 0; + this->counter = 0; + this->reset = false; + } + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(this->primID, + _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); + } +}; + +// helper C wrappers to avoid having to rewrite all the PA topology state functions +INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) +{ + return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); +} +INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) +{ + return pa.GetSimdVector(index, slot); +} + +INLINE __m128 swizzleLane0(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane1(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane2(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane3(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane4(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); + +} + +INLINE __m128 swizzleLane5(const simdvector &a) +{ + simdscalar 
tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLane6(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLane7(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLaneN(const simdvector &a, int lane) +{ + switch (lane) { + case 0: + return swizzleLane0(a); + case 1: + return swizzleLane1(a); + case 2: + return swizzleLane2(a); + case 3: + return swizzleLane3(a); + case 4: + return swizzleLane4(a); + case 5: + return swizzleLane5(a); + case 6: + return swizzleLane6(a); + case 7: + return swizzleLane7(a); + default: + return _mm_setzero_ps(); + } +} + +// Cut-aware primitive assembler. +struct PA_STATE_CUT : public PA_STATE +{ + simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex + uint32_t numVerts; // number of vertices available in buffer store + uint32_t numAttribs; // number of attributes + int32_t numRemainingVerts; // number of verts remaining to be assembled + uint32_t numVertsToAssemble; // total number of verts to assemble for the draw + OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather + simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd + uint32_t numPrimsAssembled; // number of primitives that are fully assembled + uint32_t headVertex; // current unused vertex slot in vertex buffer store + uint32_t tailVertex; // beginning vertex currently assembling + uint32_t curVertex; // current unprocessed vertex + uint32_t startPrimId; // starting prim id + simdscalari vPrimId; // vector of prim ID + bool needOffsets; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim; + simdvertex tmpVertex; // temporary simdvertex for unimplemented API + bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored,
+ // while the GS sends valid verts for every index.
+ // Topology state tracking
+ uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
+ uint32_t curIndex;
+ bool reverseWinding; // indicates reverse winding for strips
+ int32_t adjExtraVert; // extra vert used for tristrip w/ adj
+
+ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
+ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+
+ PA_STATE_CUT() {}
+ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
+ uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
+ : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+ {
+ numVerts = in_streamSizeInVerts;
+ numAttribs = in_numAttribs;
+ binTopology = topo;
+ needOffsets = false;
+ processCutVerts = in_processCutVerts;
+
+ numVertsToAssemble = numRemainingVerts = in_numVerts;
+ numPrimsAssembled = 0;
+ headVertex = tailVertex = curVertex = 0;
+
+ curIndex = 0;
+ pCutIndices = in_pIndices;
+ memset(indices, 0, sizeof(indices));
+ vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ reverseWinding = false;
+ adjExtraVert = -1;
+
+ bool gsEnabled = pDC->pState->state.gsState.gsEnable;
+ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
+
+ switch (topo)
+ {
+ case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
+ case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
+ case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
+ case TOP_TRI_STRIP_ADJ: if (gsEnabled)
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
+ }
+ else
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
+ }
+ break;
+
+ case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
+ case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
+ case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
+ case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
+ case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ?
&PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; + default: assert(0 && "Unimplemented topology"); + } + } + + simdvertex& GetNextVsOutput() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; + this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; + this->needOffsets = true; + return ((simdvertex*)pStreamBase)[vertexIndex]; + } + + simdmask& GetNextVsIndices() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; + simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; + return *pCurCutIndex; + } + + simdvector& GetSimdVector(uint32_t index, uint32_t slot) + { + // unused + SWR_ASSERT(0 && "Not implemented"); + return this->tmpVertex.attrib[0]; + } + + bool GetNextStreamOutput() + { + this->headVertex += KNOB_SIMD_WIDTH; + this->needOffsets = true; + return HasWork(); + } + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); + } + + void Reset() + { + this->numRemainingVerts = this->numVertsToAssemble; + this->numPrimsAssembled = 0; + this->curIndex = 0; + this->curVertex = 0; + this->tailVertex = 0; + this->headVertex = 0; + this->reverseWinding = false; + this->adjExtraVert = -1; + this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + } + + bool HasWork() + { + return this->numRemainingVerts > 0 || this->adjExtraVert != -1; + } + + bool IsVertexStoreFull() + { + return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; + } + + void RestartTopology() + { + this->curIndex = 0; + this->reverseWinding = false; + this->adjExtraVert = -1; + } + + bool IsCutIndex(uint32_t vertex) + { + uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; + uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); + return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; + } + + // iterates across the unprocessed verts until we hit the end or we + // have assembled SIMD prims + void ProcessVerts() + { + while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && + this->numRemainingVerts > 0 && + this->curVertex != this->headVertex) + { + // if cut index, restart topology + if (IsCutIndex(this->curVertex)) + { + if (this->processCutVerts) + { + (this->*pfnPa)(this->curVertex, false); + } + // finish off tri strip w/ adj before restarting topo + if (this->adjExtraVert != -1) + { + (this->*pfnPa)(this->curVertex, true); + } + RestartTopology(); + } + else + { + (this->*pfnPa)(this->curVertex, false); + } + + this->curVertex = (this->curVertex + 1) % this->numVerts; + this->numRemainingVerts--; + } + + // special case last primitive for tri strip w/ adj + if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) + { + (this->*pfnPa)(this->curVertex, true); + } + } + + void Advance() + { + // done with current batch + // advance tail to the current unsubmitted vertex + this->tailVertex = this->curVertex; + this->numPrimsAssembled = 0; + this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); + } + + bool NextPrim() + { + // if we've assembled enough prims, we can advance to the next set of verts + if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) + { + Advance(); + } + return false; + } + + void ComputeOffsets() + { + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; + + // step to simdvertex batch + const uint32_t simdShift = 3; // @todo make knob + 
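// Editor's note, a worked example (assuming KNOB_SIMD_WIDTH == 8, so simdShift == 3
+ // and simdMask == 0x7): a gathered vertex index of 19 lives in simdvertex block
+ // 19 >> 3 == 2 at lane 19 & 7 == 3, giving a byte offset of
+ // 2 * sizeof(simdvertex) + 3 * sizeof(float) before the per-slot and per-component
+ // offsets are added in Assemble().
+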
simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); + this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); + + // step to index + const uint32_t simdMask = 0x7; // @todo make knob + simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); + this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); + } + } + + bool Assemble(uint32_t slot, simdvector result[]) + { + // process any outstanding verts + ProcessVerts(); + + // return false if we don't have enough prims assembled + if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) + { + return false; + } + + // cache off gather offsets given the current SIMD set of indices the first time we get an assemble + if (this->needOffsets) + { + ComputeOffsets(); + this->needOffsets = false; + } + + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + simdscalari offsets = this->vOffsets[v]; + + // step to attribute + offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); + + float* pBase = (float*)this->pStreamBase; + for (uint32_t c = 0; c < 4; ++c) + { + result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); + + // move base to next component + pBase += KNOB_SIMD_WIDTH; + } + } + + return true; + } + + void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) + { + // move to slot + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; + uint32_t offset = pOffset[triIndex]; + offset += sizeof(simdvector) * slot; + float* pVert = (float*)&tri[v]; + for (uint32_t c = 0; c < 4; ++c) + { + float* pComponent = (float*)(this->pStreamBase + offset); + pVert[c] = *pComponent; + offset += KNOB_SIMD_WIDTH * sizeof(float); + } + } + } + + uint32_t NumPrims() + { + return this->numPrimsAssembled; + } + + // Per-topology functions + void ProcessVertTriStrip(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 3) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + if (reverseWinding) + { + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + this->indices[2][this->numPrimsAssembled] = this->vert[1]; + } + else + { + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + } + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->curIndex = 2; + this->reverseWinding ^= 1; + } + } + + template<bool gsEnabled> + void AssembleTriStripAdj() + { + if (!gsEnabled) + { + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[4]; + + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + + this->vert[4] = this->vert[2]; + this->vert[2] = this->vert[1]; + } + else + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + this->indices[4][this->numPrimsAssembled] = this->vert[4]; + this->indices[5][this->numPrimsAssembled] = this->vert[5]; + } + this->numPrimsAssembled++; + } + + + 
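// Editor's note: with no GS bound only the three true triangle corners (pattern
+ // verts 0, 2, 4 of the 6-vert adjacency layout) are binned, which is why
+ // AssembleTriStripAdj above temporarily compacts vert[2]/vert[4] into slots 1/2
+ // before writing the gather indices; with a GS bound all six adjacency verts are
+ // forwarded unchanged.
+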
template<bool gsEnabled>
+ void ProcessVertTriStripAdj(uint32_t index, bool finish)
+ {
+ // handle last primitive of tristrip
+ if (finish && this->adjExtraVert != -1)
+ {
+ this->vert[3] = this->adjExtraVert;
+ AssembleTriStripAdj<gsEnabled>();
+ this->adjExtraVert = -1;
+ return;
+ }
+
+ switch (this->curIndex)
+ {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ break;
+ case 3:
+ this->vert[5] = index;
+ this->curIndex++;
+ break;
+ case 5:
+ if (this->adjExtraVert == -1)
+ {
+ this->adjExtraVert = index;
+ }
+ else
+ {
+ this->vert[3] = index;
+ if (!gsEnabled)
+ {
+ AssembleTriStripAdj<gsEnabled>();
+
+ uint32_t nextTri[6];
+ if (this->reverseWinding)
+ {
+ nextTri[0] = this->vert[4];
+ nextTri[1] = this->vert[0];
+ nextTri[2] = this->vert[2];
+ nextTri[4] = this->vert[3];
+ nextTri[5] = this->adjExtraVert;
+ }
+ else
+ {
+ nextTri[0] = this->vert[2];
+ nextTri[1] = this->adjExtraVert;
+ nextTri[2] = this->vert[3];
+ nextTri[4] = this->vert[4];
+ nextTri[5] = this->vert[0];
+ }
+ for (uint32_t i = 0; i < 6; ++i)
+ {
+ this->vert[i] = nextTri[i];
+ }
+
+ this->adjExtraVert = -1;
+ this->reverseWinding ^= 1;
+ }
+ else
+ {
+ this->curIndex++;
+ }
+ }
+ break;
+ case 6:
+ SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
+ AssembleTriStripAdj<gsEnabled>();
+
+ uint32_t nextTri[6];
+ if (this->reverseWinding)
+ {
+ nextTri[0] = this->vert[4];
+ nextTri[1] = this->vert[0];
+ nextTri[2] = this->vert[2];
+ nextTri[4] = this->vert[3];
+ nextTri[5] = this->adjExtraVert;
+ }
+ else
+ {
+ nextTri[0] = this->vert[2];
+ nextTri[1] = this->adjExtraVert;
+ nextTri[2] = this->vert[3];
+ nextTri[4] = this->vert[4];
+ nextTri[5] = this->vert[0];
+ }
+ for (uint32_t i = 0; i < 6; ++i)
+ {
+ this->vert[i] = nextTri[i];
+ }
+ this->reverseWinding ^= 1;
+ this->adjExtraVert = index;
+ this->curIndex--;
+ break;
+ }
+ }
+
+ void ProcessVertTriList(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 3)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[1];
+ this->indices[2][this->numPrimsAssembled] = this->vert[2];
+
+ // increment numPrimsAssembled
+ this->numPrimsAssembled++;
+
+ // set up next prim state
+ this->curIndex = 0;
+ }
+ }
+
+ void ProcessVertTriListAdj(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 6)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[1];
+ this->indices[2][this->numPrimsAssembled] = this->vert[2];
+ this->indices[3][this->numPrimsAssembled] = this->vert[3];
+ this->indices[4][this->numPrimsAssembled] = this->vert[4];
+ this->indices[5][this->numPrimsAssembled] = this->vert[5];
+
+ // increment numPrimsAssembled
+ this->numPrimsAssembled++;
+
+ // set up next prim state
+ this->curIndex = 0;
+ }
+ }
+
+ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 6)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[2];
+ this->indices[2][this->numPrimsAssembled] = this->vert[4];
+
+ // increment numPrimsAssembled
+
this->numPrimsAssembled++; + + // set up next prim state + this->curIndex = 0; + } + } + + + void ProcessVertLineList(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 2) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertLineStrip(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 2) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->curIndex = 1; + } + } + + void ProcessVertLineStripAdj(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; + this->curIndex = 3; + } + } + + void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[1]; + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; + this->curIndex = 3; + } + } + + void ProcessVertLineListAdj(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + this->indices[0][this->numPrimsAssembled] = this->vert[1]; + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertPointList(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 1) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->numPrimsAssembled++; + this->curIndex = 0; + } + } +}; + +// Primitive Assembly for data output from the DomainShader. 
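+// Editor's sketch (hypothetical caller; variable names are illustrative only):
+// the tessellation frontend would drive PA_TESS roughly like this, with ppIndices
+// pointing at the three index lists produced by the tessellator:
+//
+//   PA_TESS tessPa(pDC, pDsOut, strideInVectors, numAttribs, ppIndices, numPrims, TOP_TRIANGLE_LIST);
+//   while (tessPa.HasWork())
+//   {
+//       simdvector prim[3];
+//       if (tessPa.Assemble(0, prim)) // slot 0 = position
+//       {
+//           // bin up to KNOB_SIMD_WIDTH primitives here
+//       }
+//       tessPa.NextPrim();
+//   }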
+struct PA_TESS : PA_STATE +{ + PA_TESS( + DRAW_CONTEXT *in_pDC, + const simdscalar* in_pVertData, + uint32_t in_attributeStrideInVectors, + uint32_t in_numAttributes, + uint32_t* (&in_ppIndices)[3], + uint32_t in_numPrims, + PRIMITIVE_TOPOLOGY in_binTopology) : + + PA_STATE(in_pDC, nullptr, 0), + m_pVertexData(in_pVertData), + m_attributeStrideInVectors(in_attributeStrideInVectors), + m_numAttributes(in_numAttributes), + m_numPrims(in_numPrims) + { + m_vPrimId = _simd_setzero_si(); + binTopology = in_binTopology; + m_ppIndices[0] = in_ppIndices[0]; + m_ppIndices[1] = in_ppIndices[1]; + m_ppIndices[2] = in_ppIndices[2]; + + switch (binTopology) + { + case TOP_POINT_LIST: + m_numVertsPerPrim = 1; + break; + + case TOP_LINE_LIST: + m_numVertsPerPrim = 2; + break; + + case TOP_TRIANGLE_LIST: + m_numVertsPerPrim = 3; + break; + + default: + SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); + break; + } + } + + bool HasWork() + { + return m_numPrims != 0; + } + + simdvector& GetSimdVector(uint32_t index, uint32_t slot) + { + SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); + static simdvector junk = { 0 }; + return junk; + } + + static simdscalari GenPrimMask(uint32_t numPrims) + { + SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); +#if KNOB_SIMD_WIDTH == 8 + static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = + { + -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0 + }; +#elif KNOB_SIMD_WIDTH == 16 + static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; +#else +#error "Help, help, I can't get up!" +#endif + + return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); + } + + bool Assemble(uint32_t slot, simdvector verts[]) + { + static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); + SWR_ASSERT(slot < m_numAttributes); + + uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); + if (0 == numPrimsToAssemble) + { + return false; + } + + simdscalari mask = GenPrimMask(numPrimsToAssemble); + + const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) + { + simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); + + const float* pBase = pBaseAttrib; + for (uint32_t c = 0; c < 4; ++c) + { + verts[i].v[c] = _simd_mask_i32gather_ps( + _simd_setzero_ps(), + pBase, + indices, + _simd_castsi_ps(mask), + 4 /* gcc doesn't like sizeof(float) */); + pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; + } + } + + return true; + } + + void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) + { + SWR_ASSERT(slot < m_numAttributes); + SWR_ASSERT(primIndex < PA_TESS::NumPrims()); + + const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) + { + uint32_t index = m_ppIndices[i][primIndex]; + const float* pVertData = pVertDataBase; + float* pVert = (float*)&verts[i]; + + for (uint32_t c = 0; c < 4; ++c) + { + pVert[c] = pVertData[index]; + pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; + } + } + } + + bool NextPrim() + { + uint32_t numPrims = PA_TESS::NumPrims(); + m_numPrims -= numPrims; + m_ppIndices[0] += numPrims; + m_ppIndices[1] += numPrims; + m_ppIndices[2] += numPrims; + + return HasWork(); + } + + simdvertex& GetNextVsOutput() + { + SWR_ASSERT(0, "%s", 
__FUNCTION__); + static simdvertex junk; + return junk; + } + + bool GetNextStreamOutput() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + return false; + } + + simdmask& GetNextVsIndices() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + static simdmask junk; + return junk; + } + + uint32_t NumPrims() + { + return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH); + } + + void Reset() { SWR_ASSERT(0); }; + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); + } + +private: + const simdscalar* m_pVertexData = nullptr; + uint32_t m_attributeStrideInVectors = 0; + uint32_t m_numAttributes = 0; + uint32_t m_numPrims = 0; + uint32_t* m_ppIndices[3]; + + uint32_t m_numVertsPerPrim = 0; + + simdscalari m_vPrimId; +}; + +// Primitive Assembler factory class, responsible for creating and initializing the correct assembler +// based on state. +template <bool IsIndexedT> +struct PA_FACTORY +{ + PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) + { +#if KNOB_ENABLE_CUT_AWARE_PA == TRUE + const API_STATE& state = GetApiState(pDC); + if ((IsIndexedT && ( + topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || + topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || + topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ || + topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || + topo == TOP_TRI_STRIP_ADJ)) || + + // non-indexed draws with adjacency topologies must use cut-aware PA until we add support + // for them in the optimized PA + (!IsIndexedT && ( + topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))) + { + memset(&indexStore, 0, sizeof(indexStore)); + DWORD numAttribs; + _BitScanReverse(&numAttribs, state.feAttribMask); + numAttribs++; + new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, + &this->indexStore[0], numVerts, numAttribs, state.topology, false); + cutPA = true; + } + else +#endif + { + uint32_t numPrims = GetNumPrims(in_topo, numVerts); + new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); + cutPA = false; + } + + } + + PA_STATE& GetPA() + { +#if KNOB_ENABLE_CUT_AWARE_PA == TRUE + if (cutPA) + { + return this->paCut; + } + else +#endif + { + return this->paOpt; + } + } + + PA_STATE_OPT paOpt; + PA_STATE_CUT paCut; + bool cutPA; + + PRIMITIVE_TOPOLOGY topo; + + simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; + simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp new file mode 100644 index 00000000000..9850b436e39 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -0,0 +1,1177 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file pa_avx.cpp +* +* @brief AVX implementation for primitive assembly. +* N primitives are assembled at a time, where N is the SIMD width. +* A state machine, that is specific for a given topology, drives the +* assembly of vertices into triangles. +* +******************************************************************************/ +#include "context.h" +#include "pa.h" +#include "frontend.h" + +#if (KNOB_SIMD_WIDTH == 8) + +bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); + +bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]); + +bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]); + +bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[]); +void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +template <uint32_t TotalControlPoints> +void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output + // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. + // Each attribute has 4 components. + + /// @todo Optimize this + + float* pOutVec = (float*)verts; + + for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) + { + uint32_t input_cp = primIndex * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; + + // Loop over all components of the attribute + for (uint32_t i = 0; i < 4; ++i) + { + const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); + pOutVec[cp * 4 + i] = pInputVec[input_lane]; + } + } +} + +template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> +static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState( + pa, + PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, + PaPatchListSingle<TotalControlPoints>); + + return false; +} + +template<uint32_t TotalControlPoints> +static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output + // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. + // Each attribute has 4 components. + + /// @todo Optimize this + + // Loop over all components of the attribute + for (uint32_t i = 0; i < 4; ++i) + { + for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) + { + float vec[KNOB_SIMD_WIDTH]; + for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) + { + uint32_t input_cp = lane * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; + + const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); + vec[lane] = pInputVec[input_lane]; + } + verts[cp][i] = _simd_loadu_ps(vec); + } + } + + SetNextPaState( + pa, + PaPatchList<TotalControlPoints>, + PaPatchListSingle<TotalControlPoints>, + 0, + KNOB_SIMD_WIDTH, + true); + + return true; +} + +#define PA_PATCH_LIST_TERMINATOR(N) \ + template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ + { return PaPatchListTerm<N>(pa, slot, verts); } +PA_PATCH_LIST_TERMINATOR(1) +PA_PATCH_LIST_TERMINATOR(2) +PA_PATCH_LIST_TERMINATOR(3) +PA_PATCH_LIST_TERMINATOR(4) +PA_PATCH_LIST_TERMINATOR(5) +PA_PATCH_LIST_TERMINATOR(6) +PA_PATCH_LIST_TERMINATOR(7) +PA_PATCH_LIST_TERMINATOR(8) +PA_PATCH_LIST_TERMINATOR(9) +PA_PATCH_LIST_TERMINATOR(10) +PA_PATCH_LIST_TERMINATOR(11) +PA_PATCH_LIST_TERMINATOR(12) +PA_PATCH_LIST_TERMINATOR(13) +PA_PATCH_LIST_TERMINATOR(14) +PA_PATCH_LIST_TERMINATOR(15) +PA_PATCH_LIST_TERMINATOR(16) +PA_PATCH_LIST_TERMINATOR(17) +PA_PATCH_LIST_TERMINATOR(18) +PA_PATCH_LIST_TERMINATOR(19) +PA_PATCH_LIST_TERMINATOR(20) +PA_PATCH_LIST_TERMINATOR(21) +PA_PATCH_LIST_TERMINATOR(22) +PA_PATCH_LIST_TERMINATOR(23) +PA_PATCH_LIST_TERMINATOR(24) +PA_PATCH_LIST_TERMINATOR(25) +PA_PATCH_LIST_TERMINATOR(26) +PA_PATCH_LIST_TERMINATOR(27) +PA_PATCH_LIST_TERMINATOR(28) +PA_PATCH_LIST_TERMINATOR(29) +PA_PATCH_LIST_TERMINATOR(30) +PA_PATCH_LIST_TERMINATOR(31) +PA_PATCH_LIST_TERMINATOR(32) +#undef PA_PATCH_LIST_TERMINATOR + +bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[]) +{ + SetNextPaState(pa, PaTriList1, PaTriListSingle0); + return false; // Not enough vertices to assemble 4 or 8 triangles. +} + +bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaTriList2, PaTriListSingle0); + return false; // Not enough vertices to assemble 8 triangles. +} + +bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + simdvector& c = PaGetSimdVector(pa, 2, slot); + simdscalar s; + + // Tri Pattern - provoking vertex is always v0 + // v0 -> 0 3 6 9 12 15 18 21 + // v1 -> 1 4 7 10 13 16 19 22 + // v2 -> 2 5 8 11 14 17 20 23 + + for(int i = 0; i < 4; ++i) + { + simdvector& v0 = verts[0]; + v0[i] = _simd_blend_ps(a[i], b[i], 0x92); + v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); + v0[i] = _mm256_permute_ps(v0[i], 0x6C); + s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21); + v0[i] = _simd_blend_ps(v0[i], s, 0x44); + + simdvector& v1 = verts[1]; + v1[i] = _simd_blend_ps(a[i], b[i], 0x24); + v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); + v1[i] = _mm256_permute_ps(v1[i], 0xB1); + s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21); + v1[i] = _simd_blend_ps(v1[i], s, 0x66); + + simdvector& v2 = verts[2]; + v2[i] = _simd_blend_ps(a[i], b[i], 0x49); + v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); + v2[i] = _mm256_permute_ps(v2[i], 0xC6); + s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21); + v2[i] = _simd_blend_ps(v2[i], s, 0x22); + } + + SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); + return true; +} + +void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + // We have 12 simdscalars contained within 3 simdvectors which + // hold at least 8 triangles worth of data. We want to assemble a single + // triangle with data in horizontal form. + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + simdvector& c = PaGetSimdVector(pa, 2, slot); + + // Convert from vertical to horizontal. + // Tri Pattern - provoking vertex is always v0 + // v0 -> 0 3 6 9 12 15 18 21 + // v1 -> 1 4 7 10 13 16 19 22 + // v2 -> 2 5 8 11 14 17 20 23 + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane3(a); + verts[1] = swizzleLane4(a); + verts[2] = swizzleLane5(a); + break; + case 2: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane0(b); + break; + case 3: + verts[0] = swizzleLane1(b); + verts[1] = swizzleLane2(b); + verts[2] = swizzleLane3(b); + break; + case 4: + verts[0] = swizzleLane4(b); + verts[1] = swizzleLane5(b); + verts[2] = swizzleLane6(b); + break; + case 5: + verts[0] = swizzleLane7(b); + verts[1] = swizzleLane0(c); + verts[2] = swizzleLane1(c); + break; + case 6: + verts[0] = swizzleLane2(c); + verts[1] = swizzleLane3(c); + verts[2] = swizzleLane4(c); + break; + case 7: + verts[0] = swizzleLane5(c); + verts[1] = swizzleLane6(c); + verts[2] = swizzleLane7(c); + break; + }; +} + +bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); + return false; // Not enough vertices to assemble 8 triangles. 
+} + +bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + simdscalar s; + + for(int i = 0; i < 4; ++i) + { + simdscalar a0 = a[i]; + simdscalar b0 = b[i]; + + // Tri Pattern - provoking vertex is always v0 + // v0 -> 01234567 + // v1 -> 13355779 + // v2 -> 22446688 + simdvector& v0 = verts[0]; + v0[i] = a0; + + // s -> 4567891011 + s = _mm256_permute2f128_ps(a0, b0, 0x21); + // s -> 23456789 + s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); + + simdvector& v1 = verts[1]; + // v1 -> 13355779 + v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); + + simdvector& v2 = verts[2]; + // v2 -> 22446688 + v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); + } + + SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH); + return true; +} + +void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + + // Convert from vertical to horizontal. + // Tri Pattern - provoking vertex is always v0 + // v0 -> 01234567 + // v1 -> 13355779 + // v2 -> 22446688 + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane1(a); + verts[1] = swizzleLane3(a); + verts[2] = swizzleLane2(a); + break; + case 2: + verts[0] = swizzleLane2(a); + verts[1] = swizzleLane3(a); + verts[2] = swizzleLane4(a); + break; + case 3: + verts[0] = swizzleLane3(a); + verts[1] = swizzleLane5(a); + verts[2] = swizzleLane4(a); + break; + case 4: + verts[0] = swizzleLane4(a); + verts[1] = swizzleLane5(a); + verts[2] = swizzleLane6(a); + break; + case 5: + verts[0] = swizzleLane5(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane6(a); + break; + case 6: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane0(b); + break; + case 7: + verts[0] = swizzleLane7(a); + verts[1] = swizzleLane1(b); + verts[2] = swizzleLane0(b); + break; + }; +} + +bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.cur, slot); + + // Extract vertex 0 to every lane of first vector + for(int i = 0; i < 4; ++i) + { + __m256 a0 = a[i]; + simdvector& v0 = verts[0]; + v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0)); + v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00); + } + + // store off leading vertex for attributes + simdvertex* pVertex = (simdvertex*)pa.pStreamBase; + pa.leadingVertex = pVertex[pa.cur]; + + SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); + return false; // Not enough vertices to assemble 8 triangles. +} + +bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& leadVert = pa.leadingVertex.attrib[slot]; + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + simdscalar s; + + // need to fill vectors 1/2 with new verts, and v0 with anchor vert. 
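+ // Editor's illustration (8-wide SIMD assumed): with anchor vertex c the fan emits
+ // tri k = { c, k+1, k+2 }, so v0 broadcasts c to every lane while v1 and v2 walk
+ // verts k+1 and k+2 of the incoming stream.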
+ for(int i = 0; i < 4; ++i)
+ {
+ simdscalar a0 = a[i];
+ simdscalar b0 = b[i];
+
+ __m256 comp = leadVert[i];
+ simdvector& v0 = verts[0];
+ v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
+ v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
+
+ simdvector& v2 = verts[2];
+ s = _mm256_permute2f128_ps(a0, b0, 0x21);
+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
+
+ simdvector& v1 = verts[1];
+ v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
+ }
+
+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
+ return true;
+}
+
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ // vert 0 from leading vertex
+ simdvector& lead = pa.leadingVertex.attrib[slot];
+ verts[0] = swizzleLane0(lead);
+
+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
+
+ // vert 1
+ if (primIndex < 7)
+ {
+ verts[1] = swizzleLaneN(a, primIndex + 1);
+ }
+ else
+ {
+ verts[1] = swizzleLane0(b);
+ }
+
+ // vert 2
+ if (primIndex < 6)
+ {
+ verts[2] = swizzleLaneN(a, primIndex + 2);
+ }
+ else
+ {
+ verts[2] = swizzleLaneN(b, primIndex - 6);
+ }
+}
+
+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
+ return false; // Not enough vertices to assemble 8 triangles.
+}
+
+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+ simdscalar s1, s2;
+
+ for(int i = 0; i < 4; ++i)
+ {
+ simdscalar a0 = a[i];
+ simdscalar b0 = b[i];
+
+ s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
+ s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
+
+ simdvector& v0 = verts[0];
+ v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
+
+ simdvector& v1 = verts[1];
+ v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
+
+ simdvector& v2 = verts[2];
+ v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
+ }
+
+ SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+
+ switch (primIndex)
+ {
+ case 0:
+ // triangle 0 - 0 1 2
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane1(a);
+ verts[2] = swizzleLane2(a);
+ break;
+
+ case 1:
+ // triangle 1 - 0 2 3
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane2(a);
+ verts[2] = swizzleLane3(a);
+ break;
+
+ case 2:
+ // triangle 2 - 4 5 6
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane5(a);
+ verts[2] = swizzleLane6(a);
+ break;
+
+ case 3:
+ // triangle 3 - 4 6 7
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane6(a);
+ verts[2] = swizzleLane7(a);
+ break;
+
+ case 4:
+ // triangle 4 - 8 9 10 (0 1 2)
+ verts[0] = swizzleLane0(b);
+ verts[1] = swizzleLane1(b);
+ verts[2] = swizzleLane2(b);
+ break;
+
+ case 5:
+ // triangle 5 - 8 10 11 (0 2 3)
+ verts[0] = swizzleLane0(b);
+ verts[1] = swizzleLane2(b);
+ verts[2] = swizzleLane3(b);
+ break;
+
+ case 6:
+ // triangle 6 - 12 13 14 (4 5 6)
+ verts[0] = swizzleLane4(b);
+ verts[1] = swizzleLane5(b);
+ verts[2] = swizzleLane6(b);
+ break;
+
+ case 7:
+ // triangle 7 - 12 14 15 (4 6 7)
+ verts[0] = swizzleLane4(b);
+ verts[1] = swizzleLane6(b);
+ verts[2] = swizzleLane7(b);
+ break;
+ }
+}
+
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+
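// A line loop is just a line strip whose final segment reconnects to vertex 0
+ // (editor's note): reuse the strip path here, then patch verts[1] below when
+ // this is the last line of the draw.
+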
PaLineStripSingle0(pa, slot, lineIndex, verts); + + if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) { + simdvector &start = PaGetSimdVector(pa, pa.first, slot); + verts[1] = swizzleLane0(start); + } +} + +bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); + return false; +} + +bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + PaLineStrip1(pa, slot, verts); + + if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) { + // loop reconnect now + int lane = pa.numPrims - pa.numPrimsComplete - 1; + simdvector &start = PaGetSimdVector(pa, pa.first, slot); + for (int i = 0; i < 4; i++) { + float *startVtx = (float *)&(start[i]); + float *targetVtx = (float *)&(verts[1][i]); + targetVtx[lane] = startVtx[0]; + } + } + + SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); + return true; +} + + +bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineList1, PaLineListSingle0); + return false; // Not enough vertices to assemble 8 lines +} + +bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + /// @todo: verify provoking vertex is correct + // Line list 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + + // shuffle: + // 0 2 4 6 8 10 12 14 + // 1 3 5 7 9 11 13 15 + + for (uint32_t i = 0; i < 4; ++i) + { + // 0 1 2 3 8 9 10 11 + __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); + // 4 5 6 7 12 13 14 15 + __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); + + // 0 2 4 6 8 10 12 14 + verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); + // 1 3 5 7 9 11 13 15 + verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); + } + + SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true); + return true; +} + +void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + simdvector &a = PaGetSimdVector(pa, pa.prev, slot); + simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + + switch (primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + break; + case 1: + verts[0] = swizzleLane2(a); + verts[1] = swizzleLane3(a); + break; + case 2: + verts[0] = swizzleLane4(a); + verts[1] = swizzleLane5(a); + break; + case 3: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + break; + case 4: + verts[0] = swizzleLane0(b); + verts[1] = swizzleLane1(b); + break; + case 5: + verts[0] = swizzleLane2(b); + verts[1] = swizzleLane3(b); + break; + case 6: + verts[0] = swizzleLane4(b); + verts[1] = swizzleLane5(b); + break; + case 7: + verts[0] = swizzleLane6(b); + verts[1] = swizzleLane7(b); + break; + } +} + +bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); + return false; // Not enough vertices to assemble 8 lines +} + +bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + + /// @todo: verify provoking vertex is correct + // Line list 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + + // shuffle: + // 0 1 2 3 4 5 6 7 + // 1 2 3 4 5 6 7 8 + + verts[0] = a; + + for(uint32_t i = 0; i < 4; ++i) + { + // 1 2 3 x 5 6 7 x + __m256 vPermA = _mm256_permute_ps(a.v[i], 
0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
+ // 4 5 6 7 8 9 10 11
+ __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
+
+ // x x x 4 x x x 8
+ __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
+
+ verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
+ }
+
+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
+ return true;
+}
+
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
+
+ switch (lineIndex)
+ {
+ case 0:
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane1(a);
+ break;
+ case 1:
+ verts[0] = swizzleLane1(a);
+ verts[1] = swizzleLane2(a);
+ break;
+ case 2:
+ verts[0] = swizzleLane2(a);
+ verts[1] = swizzleLane3(a);
+ break;
+ case 3:
+ verts[0] = swizzleLane3(a);
+ verts[1] = swizzleLane4(a);
+ break;
+ case 4:
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane5(a);
+ break;
+ case 5:
+ verts[0] = swizzleLane5(a);
+ verts[1] = swizzleLane6(a);
+ break;
+ case 6:
+ verts[0] = swizzleLane6(a);
+ verts[1] = swizzleLane7(a);
+ break;
+ case 7:
+ verts[0] = swizzleLane7(a);
+ verts[1] = swizzleLane0(b);
+ break;
+ }
+}
+
+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
+
+ verts[0] = a; // points only have 1 vertex.
+
+ SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
+ switch(primIndex)
+ {
+ case 0:
+ verts[0] = swizzleLane0(a);
+ break;
+ case 1:
+ verts[0] = swizzleLane1(a);
+ break;
+ case 2:
+ verts[0] = swizzleLane2(a);
+ break;
+ case 3:
+ verts[0] = swizzleLane3(a);
+ break;
+ case 4:
+ verts[0] = swizzleLane4(a);
+ break;
+ case 5:
+ verts[0] = swizzleLane5(a);
+ break;
+ case 6:
+ verts[0] = swizzleLane6(a);
+ break;
+ case 7:
+ verts[0] = swizzleLane7(a);
+ break;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for RECT_LIST topology.
+/// There are not enough vertices to assemble 8 triangles.
+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaRectList1, PaRectListSingle0);
+ return false;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for RECT_LIST topology.
+/// Rect lists have the following format.
+/// w x y z
+/// v2 o---o v5 o---o v8 o---o v11 o---o
+/// | \ | | \ | | \ | | \ |
+/// v1 o---o v4 o---o v7 o---o v10 o---o
+/// v0 v3 v6 v9
+///
+/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
+///
+/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
+/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
+/// etc.
+///
+/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
+/// where v0 contains all the first vertices for 8 triangles.
+///
+/// Result:
+/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
+/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
+/// verts[2] = { v2, w, v5, x, v8, y, v11, z }
+///
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
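+/// Editor's note: the implied corner completes a parallelogram; opposite corners
+/// share a midpoint, so w + v1 == v0 + v2 and hence w = v0 - v1 + v2, which is
+/// exactly what the (v0 - v1) + v2 computation in the loop below evaluates.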
+bool PaRectList1(
+ PA_STATE_OPT& pa,
+ uint32_t slot,
+ simdvector verts[])
+{
+ // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
+ simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
+ simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
+
+ __m256 tmp0, tmp1, tmp2;
+
+ // Loop over each component in the simdvector.
+ for(int i = 0; i < 4; ++i)
+ {
+ simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+ tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
+ v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
+ tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
+ v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
+ v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
+
+ /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
+ /// AVX2 should make this much cheaper.
+ simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+ v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
+ tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
+ tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
+ tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
+ v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
+ v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
+ v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+
+ // verts[2] = { v2, w, v5, x, v8, y, v11, z }
+ simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
+ v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
+ tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
+ v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
+
+ // Need to compute 4th implied vertex for the rectangle.
+ tmp2 = _mm256_sub_ps(v0[i], v1[i]);
+ tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
+ tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
+ v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
+ }
+
+ SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 2 for RECT_LIST topology.
+/// Not implemented unless there is a use case for more than 8 rects.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
+bool PaRectList2(
+ PA_STATE_OPT& pa,
+ uint32_t slot,
+ simdvector verts[])
+{
+ SWR_ASSERT(0); // Is rect list used for anything other than clears?
+ SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief This procedure is called by the Binner to assemble the attributes.
+/// Unlike position, which is stored vertically, the attributes are
+/// stored horizontally.
+////////////////////////////////////////////////////////////////////////// +/// @brief This procedure is called by the Binner to assemble the attributes. +/// Unlike position, which is stored vertically, the attributes are +/// stored horizontally. The outputs from the VS, labeled as 'a' and +/// 'b', are vertical. This function needs to transpose the lanes +/// containing the vertical attribute data into horizontal form. +/// @param pa - State for PA state machine. +/// @param slot - Index into VS output for a given attribute. +/// @param primIndex - Binner processes each triangle individually. +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. +void PaRectListSingle0( + PA_STATE_OPT& pa, + uint32_t slot, + uint32_t primIndex, + __m128 verts[]) +{ + // We have 12 simdscalars contained within 3 simdvectors which + // hold at least 8 triangles' worth of data. We want to assemble a single + // triangle with data in horizontal form. + simdvector& a = PaGetSimdVector(pa, 0, slot); + + // Convert from vertical to horizontal. + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane2(a); + verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2); + break; + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + SWR_ASSERT(0); + break; + }; +} + +PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, + bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), + cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) +{ + const API_STATE& state = GetApiState(pDC); + + this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; + + switch (this->binTopology) + { + case TOP_TRIANGLE_LIST: + this->pfnPaFunc = PaTriList0; + break; + case TOP_TRIANGLE_STRIP: + this->pfnPaFunc = PaTriStrip0; + break; + case TOP_TRIANGLE_FAN: + this->pfnPaFunc = PaTriFan0; + break; + case TOP_QUAD_LIST: + this->pfnPaFunc = PaQuadList0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_QUAD_STRIP: + // quad strip pattern when decomposed into triangles is the same as triangle strips + this->pfnPaFunc = PaTriStrip0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_LINE_LIST: + this->pfnPaFunc = PaLineList0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_STRIP: + this->pfnPaFunc = PaLineStrip0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_LOOP: + this->pfnPaFunc = PaLineLoop0; + this->numPrims = in_numPrims; + break; + case TOP_POINT_LIST: + // use point binner and rasterizer if supported + this->pfnPaFunc = PaPoints0; + this->numPrims = in_numPrims; + break; + case TOP_RECT_LIST: + this->pfnPaFunc = PaRectList0; + this->numPrims = in_numPrims * 2; + break; + + case TOP_PATCHLIST_1: + this->pfnPaFunc = PaPatchList<1>; + break; + case TOP_PATCHLIST_2: + this->pfnPaFunc = PaPatchList<2>; + break; + case TOP_PATCHLIST_3: + this->pfnPaFunc = PaPatchList<3>; + break; + case TOP_PATCHLIST_4: + this->pfnPaFunc = PaPatchList<4>; + break; + case TOP_PATCHLIST_5: + this->pfnPaFunc = PaPatchList<5>; + break; + case TOP_PATCHLIST_6: + this->pfnPaFunc = PaPatchList<6>; + break; + case TOP_PATCHLIST_7: + this->pfnPaFunc = PaPatchList<7>; + break; + case TOP_PATCHLIST_8: + this->pfnPaFunc = PaPatchList<8>; + break; + case TOP_PATCHLIST_9: + this->pfnPaFunc = PaPatchList<9>; + break; + case TOP_PATCHLIST_10: + this->pfnPaFunc = PaPatchList<10>; + break; + case
TOP_PATCHLIST_11: + this->pfnPaFunc = PaPatchList<11>; + break; + case TOP_PATCHLIST_12: + this->pfnPaFunc = PaPatchList<12>; + break; + case TOP_PATCHLIST_13: + this->pfnPaFunc = PaPatchList<13>; + break; + case TOP_PATCHLIST_14: + this->pfnPaFunc = PaPatchList<14>; + break; + case TOP_PATCHLIST_15: + this->pfnPaFunc = PaPatchList<15>; + break; + case TOP_PATCHLIST_16: + this->pfnPaFunc = PaPatchList<16>; + break; + case TOP_PATCHLIST_17: + this->pfnPaFunc = PaPatchList<17>; + break; + case TOP_PATCHLIST_18: + this->pfnPaFunc = PaPatchList<18>; + break; + case TOP_PATCHLIST_19: + this->pfnPaFunc = PaPatchList<19>; + break; + case TOP_PATCHLIST_20: + this->pfnPaFunc = PaPatchList<20>; + break; + case TOP_PATCHLIST_21: + this->pfnPaFunc = PaPatchList<21>; + break; + case TOP_PATCHLIST_22: + this->pfnPaFunc = PaPatchList<22>; + break; + case TOP_PATCHLIST_23: + this->pfnPaFunc = PaPatchList<23>; + break; + case TOP_PATCHLIST_24: + this->pfnPaFunc = PaPatchList<24>; + break; + case TOP_PATCHLIST_25: + this->pfnPaFunc = PaPatchList<25>; + break; + case TOP_PATCHLIST_26: + this->pfnPaFunc = PaPatchList<26>; + break; + case TOP_PATCHLIST_27: + this->pfnPaFunc = PaPatchList<27>; + break; + case TOP_PATCHLIST_28: + this->pfnPaFunc = PaPatchList<28>; + break; + case TOP_PATCHLIST_29: + this->pfnPaFunc = PaPatchList<29>; + break; + case TOP_PATCHLIST_30: + this->pfnPaFunc = PaPatchList<30>; + break; + case TOP_PATCHLIST_31: + this->pfnPaFunc = PaPatchList<31>; + break; + case TOP_PATCHLIST_32: + this->pfnPaFunc = PaPatchList<32>; + break; + + default: + SWR_ASSERT(0); + break; + }; + + this->pfnPaFuncReset = this->pfnPaFunc; + + // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); + simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); + + switch(this->binTopology) + { + case TOP_TRIANGLE_LIST: + case TOP_TRIANGLE_STRIP: + case TOP_TRIANGLE_FAN: + case TOP_LINE_STRIP: + case TOP_LINE_LIST: + case TOP_LINE_LOOP: + this->primIDIncr = 8; + this->primID = id8; + break; + case TOP_QUAD_LIST: + case TOP_QUAD_STRIP: + case TOP_RECT_LIST: + this->primIDIncr = 4; + this->primID = id4; + break; + case TOP_POINT_LIST: + this->primIDIncr = 8; + this->primID = id8; + break; + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + // Always run KNOB_SIMD_WIDTH number of patches at a time. 
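// primID seeds the per-lane primitive IDs for one SIMD batch and primIDIncr
// advances them between batches: with one primitive per lane the seed is
// {0..7} (id8) and the increment is 8, while topologies that emit two
// triangles per input primitive (quads, rects) use {0,0,1,1,2,2,3,3} (id4)
// with an increment of 4 so both halves share one primitive ID.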
+ this->primIDIncr = 8; + this->primID = id8; + break; + + default: + SWR_ASSERT(0); + break; + }; + +} +#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp new file mode 100644 index 00000000000..587e336d87d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -0,0 +1,1393 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rasterizer.cpp +* +* @brief Implementation for the rasterizer. +* +******************************************************************************/ + +#include <vector> +#include <algorithm> + +#include "rasterizer.h" +#include "multisample.h" +#include "rdtsc_core.h" +#include "backend.h" +#include "utils.h" +#include "frontend.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" + +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, + uint32_t numSamples, uint32_t renderTargetArrayIndex); +void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep); +void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, + uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep); + +#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} +const __m128 gMaskToVec[] = { + MASKTOVEC(0,0,0,0), + MASKTOVEC(0,0,0,1), + MASKTOVEC(0,0,1,0), + MASKTOVEC(0,0,1,1), + MASKTOVEC(0,1,0,0), + MASKTOVEC(0,1,0,1), + MASKTOVEC(0,1,1,0), + MASKTOVEC(0,1,1,1), + MASKTOVEC(1,0,0,0), + MASKTOVEC(1,0,0,1), + MASKTOVEC(1,0,1,0), + MASKTOVEC(1,0,1,1), + MASKTOVEC(1,1,0,0), + MASKTOVEC(1,1,0,1), + MASKTOVEC(1,1,1,0), + MASKTOVEC(1,1,1,1), +}; + +const __m256d gMaskToVecpd[] = +{ + MASKTOVEC(0, 0, 0, 0), + MASKTOVEC(0, 0, 0, 1), + MASKTOVEC(0, 0, 1, 0), + MASKTOVEC(0, 0, 1, 1), + MASKTOVEC(0, 1, 0, 0), + MASKTOVEC(0, 1, 0, 1), + MASKTOVEC(0, 1, 1, 0), + MASKTOVEC(0, 1, 1, 1), + MASKTOVEC(1, 0, 0, 0), + MASKTOVEC(1, 0, 0, 1), + MASKTOVEC(1, 0, 1, 0), + MASKTOVEC(1, 0, 1, 1), + MASKTOVEC(1, 1, 0, 0), + MASKTOVEC(1, 1, 0, 1), + MASKTOVEC(1, 1, 1, 0), + MASKTOVEC(1, 1, 1, 1), +}; + +struct POS +{ + int32_t x, y; +}; + +struct EDGE +{ + double a, b; // a, b edge coefficients in fix8 + double stepQuadX; // step 
to adjacent horizontal quad in fix16 + double stepQuadY; // step to adjacent vertical quad in fix16 + double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 + double stepRasterTileY; // step to adjacent vertical raster tile in fix16 + + __m256d vQuadOffsets; // offsets for 4 samples of a quad + __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief rasterize a raster tile partially covered by the triangle +/// @param pDC - draw context +/// @param startEdges - edge equations evaluated at the UL corner of the raster tile +/// @param pRastEdges - per-edge A & B coefs (Ax + By + C) and precomputed steps, +/// used to step between quads when sweeping over the raster tile. +template<uint32_t NumEdges> +INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) +{ + uint64_t coverageMask = 0; + + __m256d vEdges[NumEdges]; + __m256d vStepX[NumEdges]; + __m256d vStepY[NumEdges]; + + for (uint32_t e = 0; e < NumEdges; ++e) + { + // Step to the pixel sample locations of the 1st quad + vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); + + // compute step to next quad (mul by 2 in x and y direction) + vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); + vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); + } + + // fast unrolled version for 8x8 tile +#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 + int edgeMask[NumEdges]; + uint64_t mask; + + auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; + auto update_lambda = [&](int e){mask &= edgeMask[e];}; + auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; + auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; + auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; + +// evaluate which pixels in the quad are covered +#define EVAL \ + UnrollerL<0, NumEdges, 1>::step(eval_lambda); + + // update coverage mask +#define UPDATE_MASK(bit) \ + mask = edgeMask[0]; \ + UnrollerL<1, NumEdges, 1>::step(update_lambda); \ + coverageMask |= (mask << bit); + + // step in the +x direction to the next quad +#define INCX \ + UnrollerL<0, NumEdges, 1>::step(incx_lambda); + + // step in the +y direction to the next quad +#define INCY \ + UnrollerL<0, NumEdges, 1>::step(incy_lambda); + + // step in the -x direction to the next quad +#define DECX \ + UnrollerL<0, NumEdges, 1>::step(decx_lambda); + + // sweep 2x2 quad back and forth through the raster tile, + // computing coverage masks for the entire tile + + // raster tile + // 0 1 2 3 4 5 6 7 + // x x + // x x ------------------> + // x x | + // <-----------------x x V + // ..
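// Each UPDATE_MASK(bit) below deposits a 4-bit quad coverage mask at 'bit'
// within the 64-bit tile mask (16 quads x 4 bits for an 8x8 tile). The
// serpentine sweep (left to right across even quad rows, right to left across
// odd ones) lets every step reuse the previous quad's edge values with a
// single add or subtract, which is why row 1 updates bits 28, 24, 20, 16 in
// descending order.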
+ + // row 0 + EVAL; + UPDATE_MASK(0); + INCX; + EVAL; + UPDATE_MASK(4); + INCX; + EVAL; + UPDATE_MASK(8); + INCX; + EVAL; + UPDATE_MASK(12); + INCY; + + // row 1 + EVAL; + UPDATE_MASK(28); + DECX; + EVAL; + UPDATE_MASK(24); + DECX; + EVAL; + UPDATE_MASK(20); + DECX; + EVAL; + UPDATE_MASK(16); + INCY; + + // row 2 + EVAL; + UPDATE_MASK(32); + INCX; + EVAL; + UPDATE_MASK(36); + INCX; + EVAL; + UPDATE_MASK(40); + INCX; + EVAL; + UPDATE_MASK(44); + INCY; + + // row 3 + EVAL; + UPDATE_MASK(60); + DECX; + EVAL; + UPDATE_MASK(56); + DECX; + EVAL; + UPDATE_MASK(52); + DECX; + EVAL; + UPDATE_MASK(48); +#else + uint32_t bit = 0; + for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) + { + __m256d vStartOfRowEdge[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + vStartOfRowEdge[e] = vEdges[e]; + } + + for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) + { + int edgeMask[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + edgeMask[e] = _mm256_movemask_pd(vEdges[e]); + } + + uint64_t mask = edgeMask[0]; + for (uint32_t e = 1; e < NumEdges; ++e) + { + mask &= edgeMask[e]; + } + coverageMask |= (mask << bit); + + // step to the next pixel in the x + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); + } + bit+=4; + } + + // step to the next row + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); + } + } +#endif + return coverageMask; + +} +// Top left rule: +// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge +// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge +// Top left: a sample exactly on an edge is 'in' only if that edge is a top or left edge. +// Out (by De Morgan): !top = !(horizontal && above) = !horizontal || below +// Out (by De Morgan): !left = !(!horizontal && left) = horizontal || right +INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge) +{ + // if vA < 0, vC-- + // if vA == 0 && vB < 0, vC-- + + __m256d vEdgeOut = vEdge; + __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); + + // if vA < 0 (line is not horizontal and below) + int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); + + // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) + __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); + int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); + msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); + + // if either of these are true and we're on the line (edge == 0), bump it outside the line + vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); + return vEdgeOut; +} + +// max(abs(dz/dx), abs(dz/dy)) +INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) +{ + /* + // evaluate i,j at (0,0) + float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // evaluate i,j at (1,0) + float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // compute dz/dx + float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; + float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; + float dzdx = abs(d10 - d00); + + // evaluate i,j at (0,1) + float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; + float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; + + float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; + float dzdy = abs(d01 - d00); + */ + + // optimized version of above + float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); + float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); + + return std::max(dzdx, dzdy); +}
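// ComputeBiasFactor below returns one unit in the last place (ULP) of the
// depth buffer format, the granularity depth bias is specified in: a constant
// 2^-24 or 2^-16 for the UNORM formats, and 2^(exponent(zMax) - 23) for
// 32-bit float depth, where the exponent is isolated by masking the IEEE-754
// exponent bits of the largest |z| of the triangle.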
+ +INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) +{ + if (pState->depthFormat == R24_UNORM_X8_TYPELESS) + { + return (1.0f / (1 << 24)); + } + else if (pState->depthFormat == R16_UNORM) + { + return (1.0f / (1 << 16)); + } + else + { + SWR_ASSERT(pState->depthFormat == R32_FLOAT); + + // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23) + float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); + uint32_t zMaxInt = *(uint32_t*)&zMax; + zMaxInt &= 0x7f800000; + zMax = *(float*)&zMaxInt; + + return zMax * (1.0f / (1 << 23)); + } +} + +INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) +{ + if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) + { + return 0.0f; + } + + float scale = pState->slopeScaledDepthBias; + if (scale != 0.0f) + { + scale *= ComputeMaxDepthSlope(pTri); + } + + float bias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + scale; + if (pState->depthBiasClamp > 0.0f) + { + bias = std::min(bias, pState->depthBiasClamp); + } + else if (pState->depthBiasClamp < 0.0f) + { + bias = std::max(bias, pState->depthBiasClamp); + } + + return bias; +} + +// Prevent DCE by writing coverage mask from rasterizer to volatile +#if KNOB_ENABLE_TOSS_POINTS +__declspec(thread) volatile uint64_t gToss; +#endif + +static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; +// try to avoid _chkstk insertions; make this thread local +static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; + +INLINE +void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) +{ + edge.a = a; + edge.b = b; + + // compute constant steps to adjacent quads + edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); + edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); + + // compute constant steps to adjacent raster tiles + edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); + edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); + + // compute quad offsets + const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); + const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); + + __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); + __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); + edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); + + // compute raster tile offsets + const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); + const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); + + __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); + __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); + edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); +} + +INLINE +void ComputeEdgeData(const POS& p0,
const POS& p1, EDGE& edge) +{ + ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); +} + +template<bool RasterizeScissorEdges, SWR_MULTISAMPLE_COUNT sampleCount> +void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) +{ + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + RDTSC_START(BERasterizeTriangle); + + RDTSC_START(BETriangleSetup); + const API_STATE &state = GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; + + __m128 vX, vY, vZ, vRecipW; + + // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care + // eg: vX = [x0 x1 x2 dc] + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // convert to fixed point + __m128i vXi = fpToFixedPoint(vX); + __m128i vYi = fpToFixedPoint(vY); + + // quantize floating point position to fixed point precision + // to prevent attribute creep around the triangle vertices + vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + + // triangle setup - A and B edge equation coefs + __m128 vA, vB; + triangleSetupAB(vX, vY, vA, vB); + + __m128i vAi, vBi; + triangleSetupABInt(vXi, vYi, vAi, vBi); + + // determinant + float det = calcDeterminantInt(vAi, vBi); + + /// @todo: This test is flipped...we have a stray '-' sign somewhere + // Convert CW triangles to CCW + if (det > 0.0) + { + vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); + vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); + vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); + vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); + det = -det; + } + + __m128 vC; + // Finish triangle setup - C edge coef + triangleSetupC(vX, vY, vA, vB, vC); + + // compute barycentric i and j + // i = (A1x + B1y + C1)/det + // j = (A2x + B2y + C2)/det + __m128 vDet = _mm_set1_ps(det); + __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet); + _mm_store_ss(&triDesc.recipDet, vRecipDet); + + // only extract coefs for 2 of the barycentrics; the 3rd can be + // determined from the barycentric equation: + // i + j + k = 1 <=> k = 1 - j - i + _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); + _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); + _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); + _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); + _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); + _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); + + OSALIGN(float, 16) oneOverW[4]; + _mm_store_ps(oneOverW, vRecipW); + triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; + triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; + triDesc.OneOverW[2] = oneOverW[2]; + + // calculate perspective correct coefs per vertex attrib + float* pPerspAttribs = perspAttribsTLS; + float* pAttribs = workDesc.pAttribs; + triDesc.pPerspAttribs = pPerspAttribs; + triDesc.pAttribs = pAttribs; + float *pRecipW = workDesc.pTriBuffer + 12; + triDesc.pRecipW = pRecipW; + __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); + __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); + __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); + for(uint32_t i = 0; i < workDesc.numAttribs; i++) + { + __m128 attribA = _mm_load_ps(pAttribs); + __m128 attribB = 
_mm_load_ps(pAttribs+=4); + __m128 attribC = _mm_load_ps(pAttribs+=4); + pAttribs+=4; + + attribA = _mm_mul_ps(attribA, vOneOverWV0); + attribB = _mm_mul_ps(attribB, vOneOverWV1); + attribC = _mm_mul_ps(attribC, vOneOverWV2); + + _mm_store_ps(pPerspAttribs, attribA); + _mm_store_ps(pPerspAttribs+=4, attribB); + _mm_store_ps(pPerspAttribs+=4, attribC); + pPerspAttribs+=4; + } + + // compute bary Z + // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) + OSALIGN(float, 16) a[4]; + _mm_store_ps(a, vZ); + triDesc.Z[0] = a[0] - a[2]; + triDesc.Z[1] = a[1] - a[2]; + triDesc.Z[2] = a[2]; + + // add depth bias + triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); + + // Compute edge data + OSALIGNSIMD(int32_t) aAi[4], aBi[4]; + _mm_store_si128((__m128i*)aAi, vAi); + _mm_store_si128((__m128i*)aBi, vBi); + + const uint32_t numEdges = 3 + (RasterizeScissorEdges ? 4 : 0); + EDGE rastEdges[7]; + + // compute triangle edges + ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); + ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); + ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); + + // compute scissor edges if enabled + if (RasterizeScissorEdges) + { + POS topLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.top}; + POS bottomLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.bottom}; + POS topRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.top}; + POS bottomRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.bottom}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + } + + // Calc bounding box of triangle + OSALIGN(BBOX, 16) bbox; + calcBoundingBoxInt(vXi, vYi, bbox); + + // Intersect with scissor/viewport + bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left); + bbox.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right); + bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); + bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); + + triDesc.triFlags = workDesc.triFlags; + + // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox + uint32_t macroX, macroY; + MacroTileMgr::getTileIndices(macroTile, macroX, macroY); + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; + + OSALIGN(BBOX, 16) intersect; + intersect.left = std::max(bbox.left, macroBoxLeft); + intersect.top = std::max(bbox.top, macroBoxTop); + intersect.right = std::min(bbox.right, macroBoxRight); + intersect.bottom = std::min(bbox.bottom, macroBoxBottom); + + SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); + + RDTSC_STOP(BETriangleSetup, 0, pDC->drawId); + + // update triangle desc + uint32_t tileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t tileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + 
FIXED_POINT_SHIFT); + uint32_t numTilesX = maxTileX - tileX + 1; + uint32_t numTilesY = maxTileY - tileY + 1; + + if (numTilesX == 0 || numTilesY == 0) + { + RDTSC_EVENT(BEEmptyTriangle, 1, 0); + RDTSC_STOP(BERasterizeTriangle, 1, 0); + return; + } + + RDTSC_START(BEStepSetup); + + // Step to pixel center of top-left pixel of the triangle bbox + // Align intersect bbox (top/left) to raster tile's (top/left). + int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); + int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); + + if(sampleCount == SWR_MULTISAMPLE_1X) + { + // Add 0.5, in fixed point, to offset to pixel center + x += (FIXED_POINT_SCALE / 2); + y += (FIXED_POINT_SCALE / 2); + } + + __m128i vTopLeftX = _mm_set1_epi32(x); + __m128i vTopLeftY = _mm_set1_epi32(y); + + // evaluate edge equations at top-left pixel using 64bit math + // all other evaluations will be 32bit steps from it + // small triangles could skip this and do all 32bit math + // edge 0 + // + // line = Ax + By + C + // solving for C: we know (x0, y0) is on the line, so plug it in: + // C = -Ax0 - By0 + // plug C back into the line equation: + // line = Ax + By - Ax0 - By0 + // line = A(x - x0) + B(y - y0) + // stepping by (dX, dY) from (x0, y0): + // line = A(x0 + dX) + B(y0 + dY) + C = Ax0 + A*dX + By0 + B*dY - Ax0 - By0 = A*dX + B*dY + + // edge 0 and 1 + // edge0 = A0(x - x0) + B0(y - y0) + // edge1 = A1(x - x1) + B1(y - y1) + __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); + __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); + + __m256d vEdgeFix16[7]; + + // evaluate A(dx) and B(dY) for all points + __m256d vAipd = _mm256_cvtepi32_pd(vAi); + __m256d vBipd = _mm256_cvtepi32_pd(vBi); + __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); + __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); + + __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); + __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); + __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); + + // adjust for top-left rule + vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + + // broadcast respective edge results to all lanes + double* pEdge = (double*)&vEdge; + vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); + vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); + vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); + + // evaluate edge equations for scissor edges + if (RasterizeScissorEdges) + { + const BBOX &scissor = state.scissorInFixedPoint; + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top))); + } + + // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile, + // used for testing whether the entire raster tile is inside a triangle + vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets); + vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets); + vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
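// Because each edge function E(x, y) = A*x + B*y + C is affine, stepping the
// evaluation point by (dx, dy) reduces to a single add of the precomputed
// A*dx + B*dy term:
//   E(x + dx, y + dy) = E(x, y) + A*dx + B*dy
// The vQuadOffsets/vRasterTileOffsets and step* members of EDGE are exactly
// such terms, so C never needs to be re-evaluated while sweeping.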
+ + // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox + // step sample positions to the raster tile bbox of multisample points + // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) + // | | + // | | + // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) + __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; + if (sampleCount > SWR_MULTISAMPLE_1X) + { + __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX(); + __m128i vTileSampleBBoxYh = MultisampleTraits<sampleCount>::TileSampleOffsetsY(); + + __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); + __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); + + // step edge equation tests from the tile corner, + // used for testing whether the entire raster tile is inside a triangle + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8); + vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8); + vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8); + vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + } + + RDTSC_STOP(BEStepSetup, 0, pDC->drawId); + + uint32_t tY = tileY; + uint32_t tX = tileX; + uint32_t maxY = maxTileY; + uint32_t maxX = maxTileX; + + // compute steps between raster tiles for render output buffers + static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; + static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; + static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; + RenderOutputBuffers renderBuffers, currentRenderBufferRow; + + GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits<sampleCount>::numSamples, + triDesc.triFlags.renderTargetArrayIndex); + currentRenderBufferRow = renderBuffers; + + // rasterize and generate coverage masks per sample + uint32_t maxSamples = MultisampleTraits<sampleCount>::numSamples; + for (uint32_t tileY = tY; tileY <= maxY; ++tileY) + { + __m256d vStartOfRowEdge[numEdges]; + for (uint32_t e = 0; e < numEdges; ++e) + { + vStartOfRowEdge[e] = vEdgeFix16[e]; + } + + for (uint32_t tileX = tX; tileX <= maxX; ++tileX) + { + uint64_t anyCoveredSamples = 0;
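// _mm256_movemask_pd gathers the sign bits of one edge function evaluated at
// the 4 raster tile corners, so a set bit means that corner is inside the
// edge. A mask of 0 for any edge trivially rejects the whole tile; masks of
// 0xf for all three edges trivially accept it without per-quad work; anything
// in between falls through to rasterizePartialTile below.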
+ + // test the 4 corners of the raster tile against each edge; the sign bit is set + // for a corner that is inside the edge (vEdge < 0) + int mask0, mask1, mask2; + if (sampleCount == SWR_MULTISAMPLE_1X) + { + mask0 = _mm256_movemask_pd(vEdgeFix16[0]); + mask1 = _mm256_movemask_pd(vEdgeFix16[1]); + mask2 = _mm256_movemask_pd(vEdgeFix16[2]); + } + else + { + __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; + // evaluate edge equations at the tile multisample bounding box + vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); + mask0 = _mm256_movemask_pd(vSampleBboxTest0); + mask1 = _mm256_movemask_pd(vSampleBboxTest1); + mask2 = _mm256_movemask_pd(vSampleBboxTest2); + } + + for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++) + { + // trivial reject, at least one edge has all 4 corners of raster tile outside + bool trivialReject = !(mask0 && mask1 && mask2); + + if (!trivialReject) + { + // trivial accept mask + triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; + if ((mask0 & mask1 & mask2) == 0xf) + { + anyCoveredSamples = triDesc.coverageMask[sampleNum]; + // trivial accept, all 4 corners of all 3 edges are negative + // i.e. raster tile completely inside triangle + RDTSC_EVENT(BETrivialAccept, 1, 0); + } + else + { + __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; + if(sampleCount == SWR_MULTISAMPLE_1X) + { + // should get optimized out for single sample case (global value numbering or copy propagation) + vEdge0AtSample = vEdgeFix16[0]; + vEdge1AtSample = vEdgeFix16[1]; + vEdge2AtSample = vEdgeFix16[2]; + } + else + { + __m128i vSampleOffsetXh = MultisampleTraits<sampleCount>::vXi(sampleNum); + __m128i vSampleOffsetYh = MultisampleTraits<sampleCount>::vYi(sampleNum); + __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); + __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); + + // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0] + // for each edge and broadcasts it before offsetting to individual pixel quads + + // step edge equation tests from UL tile corner to pixel sample position + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY); + vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY); + vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY); + vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); + } + + double startQuadEdges[numEdges]; + const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample); + _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample); + _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample); + + for (uint32_t e = 3; e < numEdges; ++e) + { + _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]); + } + + // not trivial accept or reject, must
rasterize full tile + RDTSC_START(BERasterizePartial); + if (RasterizeScissorEdges) + { + triDesc.coverageMask[sampleNum] = rasterizePartialTile<7>(pDC, startQuadEdges, rastEdges); + } + else + { + triDesc.coverageMask[sampleNum] = rasterizePartialTile<3>(pDC, startQuadEdges, rastEdges); + } + RDTSC_STOP(BERasterizePartial, 0, 0); + + anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + } + } + else + { + // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything + if(sampleCount > SWR_MULTISAMPLE_1X) + { + triDesc.coverageMask[sampleNum] = 0; + } + RDTSC_EVENT(BETrivialReject, 1, 0); + } + } + +#if KNOB_ENABLE_TOSS_POINTS + if(KNOB_TOSS_RS) + { + gToss = triDesc.coverageMask[0]; + } + else +#endif + if(anyCoveredSamples) + { + RDTSC_START(BEPixelBackend); + backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); + RDTSC_STOP(BEPixelBackend, 0, 0); + } + + // step to the next tile in X + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); + } + StepRasterTileX(state.psState.numRenderTargets, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); + } + + // step to the next tile in Y + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); + } + StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); + } + + RDTSC_STOP(BERasterizeTriangle, 1, 0); +} + +void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +{ + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + + bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0; + + // load point vertex + float x = *workDesc.pTriBuffer; + float y = *(workDesc.pTriBuffer + 1); + float z = *(workDesc.pTriBuffer + 2); + + // create a copy of the triangle buffer to write our adjusted vertices to + OSALIGNSIMD(float) newTriBuffer[4 * 4]; + TRIANGLE_WORK_DESC newWorkDesc = workDesc; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; + + // create a copy of the attrib buffer to write our adjusted attribs to + OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer; + newWorkDesc.numAttribs = workDesc.numAttribs; + newWorkDesc.triFlags = workDesc.triFlags; + + // construct two tris by bloating point by point size + float halfPointSize = workDesc.triFlags.pointSize * 0.5f; + float lowerX = x - halfPointSize; + float upperX = x + halfPointSize; + float lowerY = y - halfPointSize; + float upperY = y + halfPointSize; + + // tri 0 + float *pBuf = &newTriBuffer[0]; + *pBuf++ = lowerX; + *pBuf++ = lowerX; + *pBuf++ = upperX; + pBuf++; + *pBuf++ = lowerY; + *pBuf++ = upperY; + *pBuf++ = upperY; + pBuf++; + _mm_store_ps(pBuf, _mm_set1_ps(z)); + _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f)); + + // setup triangle rasterizer function + PFN_WORK_FUNC pfnTriRast; + if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) + { + pfnTriRast = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount]; + } + 
else + { + // for center sample pattern, all samples are at pixel center; calculate coverage + // once at center and broadcast the results in the backend + pfnTriRast = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; + } + + // overwrite texcoords for point sprites + if (isPointSpriteTexCoordEnabled) + { + // copy original attribs + memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float)); + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + // overwrite texcoord for point sprites + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; + + while (_BitScanForward(&texCoordAttrib, texCoordMask)) + { + texCoordMask &= ~(1 << texCoordAttrib); + __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; + if (rastState.pointSpriteTopOrigin) + { + pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); + } + else + { + pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); + } + } + } + else + { + // no texcoord overwrite, can reuse the attrib buffer from frontend + newWorkDesc.pAttribs = workDesc.pAttribs; + } + + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); + + // tri 1 + pBuf = &newTriBuffer[0]; + *pBuf++ = lowerX; + *pBuf++ = upperX; + *pBuf++ = upperX; + pBuf++; + *pBuf++ = lowerY; + *pBuf++ = upperY; + *pBuf++ = lowerY; + // z, w unchanged + + if (isPointSpriteTexCoordEnabled) + { + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; + + while (_BitScanForward(&texCoordAttrib, texCoordMask)) + { + texCoordMask &= ~(1 << texCoordAttrib); + __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; + if (rastState.pointSpriteTopOrigin) + { + pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1); + pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); + + } + else + { + pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1); + pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); + } + } + } + + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); +} + +void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +{ +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // map x,y relative offsets from start of raster tile to bit position in + // coverage mask for the point + static const uint32_t coverageMap[8][8] = { + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 } + }; + + OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + + // pull point information from triangle buffer + // @todo use structs for readability + uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; + uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); + float z = *(workDesc.pTriBuffer + 2); + + // construct triangle descriptor for point + // no interpolation, set up i,j for constant interpolation of z and attribs + // @todo implement an optimized backend that doesn't require triangle information + + // compute coverage mask from x,y packed into the 
coverageMask flag + // mask indices by the maximum valid index for x/y of coveragemap. + uint32_t tX = workDesc.triFlags.coverageMask & 0x7; + uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; + // todo: multisample points? + triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; + + // no persp divide needed for points + triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; + triDesc.triFlags = workDesc.triFlags; + triDesc.recipDet = 1.0f; + triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; + triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; + triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; + triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; + + RenderOutputBuffers renderBuffers; + GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, + renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex); + + RDTSC_START(BEPixelBackend); + backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); + RDTSC_STOP(BEPixelBackend, 0, 0); +} + +// Get pointers to hot tile memory for color RT, depth, stencil +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, + uint32_t numSamples, uint32_t renderTargetArrayIndex) +{ + const API_STATE& state = GetApiState(pDC); + SWR_CONTEXT *pContext = pDC->pContext; + + uint32_t mx, my; + MacroTileMgr::getTileIndices(macroID, mx, my); + tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; + tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; + + // compute tile offset for active hottile buffers + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while(_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, + numSamples, renderTargetArrayIndex); + pColor->state = HOTTILE_DIRTY; + renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; + + colorHottileEnableMask &= ~(1 << rtSlot); + } + if(state.depthHottileEnable) + { + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, + numSamples, renderTargetArrayIndex); + pDepth->state = HOTTILE_DIRTY; + SWR_ASSERT(pDepth->pBuffer != nullptr); + renderBuffers.pDepth = pDepth->pBuffer + offset; + } + if(state.stencilHottileEnable) + { + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, + numSamples, renderTargetArrayIndex); + pStencil->state = HOTTILE_DIRTY; + SWR_ASSERT(pStencil->pBuffer != nullptr); + renderBuffers.pStencil = pStencil->pBuffer + offset; + } +} + +INLINE +void 
StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep) +{ + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + buffers.pColor[rt] += colorTileStep; + } + + buffers.pDepth += depthTileStep; + buffers.pStencil += stencilTileStep; +} + +INLINE +void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep) +{ + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + startBufferRow.pColor[rt] += colorRowStep; + buffers.pColor[rt] = startBufferRow.pColor[rt]; + } + startBufferRow.pDepth += depthRowStep; + buffers.pDepth = startBufferRow.pDepth; + + startBufferRow.pStencil += stencilRowStep; + buffers.pStencil = startBufferRow.pStencil; +} + +// initialize rasterizer function table +PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX] = +{ + RasterizeTriangle<false, SWR_MULTISAMPLE_1X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_2X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_4X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_8X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_16X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_1X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_2X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_4X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_8X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_16X> +}; + +void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + + // bloat line to two tris and call the triangle rasterizer twice + RDTSC_START(BERasterizeLine); + + const API_STATE &state = GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; + + // macrotile dimensioning + uint32_t macroX, macroY; + MacroTileMgr::getTileIndices(macroTile, macroX, macroY); + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; + + // create a copy of the triangle buffer to write our adjusted vertices to + OSALIGNSIMD(float) newTriBuffer[4 * 4]; + TRIANGLE_WORK_DESC newWorkDesc = workDesc; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; + + // create a copy of the attrib buffer to write our adjusted attribs to + OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); + const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); + + __m128 vX, vY, vZ, vRecipW; + + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // triangle 0 + // v0,v1 -> v0,v0,v1 + __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); + + __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); + __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); + if (workDesc.triFlags.yMajor) + { + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, vYa); + } + + // 
Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); + + // binner bins 3 edges for lines as v0, v1, v1 + // tri0 needs v0, v0, v1 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) + { + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); + } + + // Store user clip distances for triangle 0 + float newClipBuffer[3 * 8]; + uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); + if (numClipDist) + { + newWorkDesc.pUserClipBuffer = newClipBuffer; + + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) + { + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); + + // reconstruct original clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c1; + } + } + + // make sure this macrotile intersects the triangle + __m128i vXai = fpToFixedPoint(vXa); + __m128i vYai = fpToFixedPoint(vYa); + OSALIGN(BBOX, 16) bboxA; + calcBoundingBoxInt(vXai, vYai, bboxA); + + if (!(bboxA.left > macroBoxRight || + bboxA.left > state.scissorInFixedPoint.right || + bboxA.right - 1 < macroBoxLeft || + bboxA.right - 1 < state.scissorInFixedPoint.left || + bboxA.top > macroBoxBottom || + bboxA.top > state.scissorInFixedPoint.bottom || + bboxA.bottom - 1 < macroBoxTop || + bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { + // rasterize triangle + gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + } + + // triangle 1 + // v0,v1 -> v1,v1,v0 + vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); + vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); + vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); + vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); + + vAdjust = _mm_mul_ps(vLineWidth, vBloat1); + if (workDesc.triFlags.yMajor) + { + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, vYa); + } + + // Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); + + // binner bins 3 edges for lines as v0, v1, v1 + // tri1 needs v1, v1, v0 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) + { + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); + } + + // store user clip distance for triangle 1 + if (numClipDist) + { + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) + { + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); + + // reconstruct original 
clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c0; + } + } + + vXai = fpToFixedPoint(vXa); + vYai = fpToFixedPoint(vYa); + calcBoundingBoxInt(vXai, vYai, bboxA); + + if (!(bboxA.left > macroBoxRight || + bboxA.left > state.scissorInFixedPoint.right || + bboxA.right - 1 < macroBoxLeft || + bboxA.right - 1 < state.scissorInFixedPoint.left || + bboxA.top > macroBoxBottom || + bboxA.top > state.scissorInFixedPoint.bottom || + bboxA.bottom - 1 < macroBoxTop || + bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { + // rasterize triangle + gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + } + + RDTSC_STOP(BERasterizeLine, 1, 0); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h new file mode 100644 index 00000000000..bcfeef48410 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rasterizer.h +* +* @brief Definitions for the rasterizer. +* +******************************************************************************/ +#pragma once + +#include "context.h" + +extern PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX]; +void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp new file mode 100644 index 00000000000..4b6b536075b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp @@ -0,0 +1,91 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include "rdtsc_core.h" +#include "common/rdtsc_buckets.h" + +// must match CORE_BUCKETS enum order +BUCKET_DESC gCoreBuckets[] = { + { "APIClearRenderTarget", "", true, 0xff0b8bea }, + { "APIDraw", "", true, 0xff000066 }, + { "APIDrawWakeAllThreads", "", false, 0xffffffff }, + { "APIDrawIndexed", "", true, 0xff000066 }, + { "APIDispatch", "", true, 0xff660000 }, + { "APIStoreTiles", "", true, 0xff00ffff }, + { "APIGetDrawContext", "", false, 0xffffffff }, + { "APISync", "", true, 0xff6666ff }, + { "APIWaitForIdle", "", true, 0xff0000ff }, + { "FEProcessDraw", "", true, 0xff009900 }, + { "FEProcessDrawIndexed", "", true, 0xff009900 }, + { "FEFetchShader", "", false, 0xffffffff }, + { "FEVertexShader", "", false, 0xffffffff }, + { "FEHullShader", "", false, 0xffffffff }, + { "FETessellation", "", false, 0xffffffff }, + { "FEDomainShader", "", false, 0xffffffff }, + { "FEGeometryShader", "", false, 0xffffffff }, + { "FEStreamout", "", false, 0xffffffff }, + { "FEPAAssemble", "", false, 0xffffffff }, + { "FEBinPoints", "", false, 0xff29b854 }, + { "FEBinLines", "", false, 0xff29b854 }, + { "FEBinTriangles", "", false, 0xff29b854 }, + { "FETriangleSetup", "", false, 0xffffffff }, + { "FEViewportCull", "", false, 0xffffffff }, + { "FEGuardbandClip", "", false, 0xffffffff }, + { "FEClipPoints", "", false, 0xffffffff }, + { "FEClipLines", "", false, 0xffffffff }, + { "FEClipTriangles", "", false, 0xffffffff }, + { "FECullZeroAreaAndBackface", "", false, 0xffffffff }, + { "FECullBetweenCenters", "", false, 0xffffffff }, + { "FEProcessStoreTiles", "", true, 0xff39c864 }, + { "FEProcessInvalidateTiles", "", true, 0xffffffff }, + { "WorkerWorkOnFifoBE", "", false, 0xff40261c }, + { "WorkerFoundWork", "", false, 0xff573326 }, + { "BELoadTiles", "", true, 0xffb0e2ff }, + { "BEDispatch", "", true, 0xff00a2ff }, + { "BEClear", "", true, 0xff00ccbb }, + { "BERasterizeLine", "", true, 0xffb26a4e }, + { "BERasterizeTriangle", "", true, 0xffb26a4e }, + { "BETriangleSetup", "", false, 0xffffffff }, + { "BEStepSetup", "", false, 0xffffffff }, + { "BECullZeroArea", "", false, 0xffffffff }, + { "BEEmptyTriangle", "", false, 0xffffffff }, + { "BETrivialAccept", "", false, 0xffffffff }, + { "BETrivialReject", "", false, 0xffffffff }, + { "BERasterizePartial", "", false, 0xffffffff }, + { "BEPixelBackend", "", false, 0xffffffff }, + { "BESetup", 
"", false, 0xffffffff }, + { "BEBarycentric", "", false, 0xffffffff }, + { "BEEarlyDepthTest", "", false, 0xffffffff }, + { "BEPixelShader", "", false, 0xffffffff }, + { "BELateDepthTest", "", false, 0xffffffff }, + { "BEOutputMerger", "", false, 0xffffffff }, + { "BEStoreTiles", "", true, 0xff00cccc }, + { "BEEndTile", "", false, 0xffffffff }, + { "WorkerWaitForThreadEvent", "", false, 0xffffffff }, +}; + +/// @todo bucketmanager and mapping should probably be a part of the SWR context +std::vector<uint32_t> gBucketMap; +BucketManager gBucketMgr(KNOB_BUCKETS_ENABLE_THREADVIZ); + +uint32_t gCurrentFrame = 0; diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h new file mode 100644 index 00000000000..5fcc40bf8ee --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h @@ -0,0 +1,177 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+****************************************************************************/ + +#pragma once +#include "knobs.h" + +#include "common/os.h" +#include "common/rdtsc_buckets.h" + +#include <vector> + +enum CORE_BUCKETS +{ + APIClearRenderTarget, + APIDraw, + APIDrawWakeAllThreads, + APIDrawIndexed, + APIDispatch, + APIStoreTiles, + APIGetDrawContext, + APISync, + APIWaitForIdle, + FEProcessDraw, + FEProcessDrawIndexed, + FEFetchShader, + FEVertexShader, + FEHullShader, + FETessellation, + FEDomainShader, + FEGeometryShader, + FEStreamout, + FEPAAssemble, + FEBinPoints, + FEBinLines, + FEBinTriangles, + FETriangleSetup, + FEViewportCull, + FEGuardbandClip, + FEClipPoints, + FEClipLines, + FEClipTriangles, + FECullZeroAreaAndBackface, + FECullBetweenCenters, + FEProcessStoreTiles, + FEProcessInvalidateTiles, + WorkerWorkOnFifoBE, + WorkerFoundWork, + BELoadTiles, + BEDispatch, + BEClear, + BERasterizeLine, + BERasterizeTriangle, + BETriangleSetup, + BEStepSetup, + BECullZeroArea, + BEEmptyTriangle, + BETrivialAccept, + BETrivialReject, + BERasterizePartial, + BEPixelBackend, + BESetup, + BEBarycentric, + BEEarlyDepthTest, + BEPixelShader, + BELateDepthTest, + BEOutputMerger, + BEStoreTiles, + BEEndTile, + WorkerWaitForThreadEvent, + + NumBuckets +}; + +void rdtscReset(); +void rdtscInit(int threadId); +void rdtscStart(uint32_t bucketId); +void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId); +void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2); +void rdtscEndFrame(); + +#ifdef KNOB_ENABLE_RDTSC +#define RDTSC_RESET() rdtscReset() +#define RDTSC_INIT(threadId) rdtscInit(threadId) +#define RDTSC_START(bucket) rdtscStart(bucket) +#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw) +#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2) +#define RDTSC_ENDFRAME() rdtscEndFrame() +#else +#define RDTSC_RESET() +#define RDTSC_INIT(threadId) +#define RDTSC_START(bucket) +#define RDTSC_STOP(bucket, count, draw) +#define RDTSC_EVENT(bucket, count1, count2) +#define RDTSC_ENDFRAME() +#endif + +extern std::vector<uint32_t> gBucketMap; +extern BucketManager gBucketMgr; +extern BUCKET_DESC gCoreBuckets[]; +extern uint32_t gCurrentFrame; + +INLINE void rdtscReset() +{ + gCurrentFrame = 0; + gBucketMgr.ClearThreads(); + gBucketMgr.ClearBuckets(); +} + +INLINE void rdtscInit(int threadId) +{ + // register all the buckets once + if (threadId == 0) + { + gBucketMap.resize(NumBuckets); + for (uint32_t i = 0; i < NumBuckets; ++i) + { + gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]); + } + } + + std::string name = threadId == 0 ? 
"API" : "WORKER"; + gBucketMgr.RegisterThread(name); +} + +INLINE void rdtscStart(uint32_t bucketId) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.StartBucket(id); +} + +INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.StopBucket(id); +} + +INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.AddEvent(id, count1); +} + +INLINE void rdtscEndFrame() +{ + gCurrentFrame++; + + if (gCurrentFrame == KNOB_BUCKETS_START_FRAME) + { + gBucketMgr.StartCapture(); + } + + if (gCurrentFrame == KNOB_BUCKETS_END_FRAME) + { + gBucketMgr.StopCapture(); + gBucketMgr.PrintReport("rdtsc.txt"); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h new file mode 100644 index 00000000000..2758555fd4b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -0,0 +1,1027 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file state.h +* +* @brief Definitions for API state. +* +******************************************************************************/ +#pragma once + +#include "common/formats.h" +#include "common/simdintrin.h" + +// clear flags +#define SWR_CLEAR_NONE 0 +#define SWR_CLEAR_COLOR (1 << 0) +#define SWR_CLEAR_DEPTH (1 << 1) +#define SWR_CLEAR_STENCIL (1 << 2) + +enum DRIVER_TYPE +{ + DX, + GL +}; + +////////////////////////////////////////////////////////////////////////// +/// PRIMITIVE_TOPOLOGY. +////////////////////////////////////////////////////////////////////////// +enum PRIMITIVE_TOPOLOGY +{ + TOP_UNKNOWN = 0x0, + TOP_POINT_LIST = 0x1, + TOP_LINE_LIST = 0x2, + TOP_LINE_STRIP = 0x3, + TOP_TRIANGLE_LIST = 0x4, + TOP_TRIANGLE_STRIP = 0x5, + TOP_TRIANGLE_FAN = 0x6, + TOP_QUAD_LIST = 0x7, + TOP_QUAD_STRIP = 0x8, + TOP_LINE_LIST_ADJ = 0x9, + TOP_LISTSTRIP_ADJ = 0xA, + TOP_TRI_LIST_ADJ = 0xB, + TOP_TRI_STRIP_ADJ = 0xC, + TOP_TRI_STRIP_REVERSE = 0xD, + TOP_POLYGON = 0xE, + TOP_RECT_LIST = 0xF, + TOP_LINE_LOOP = 0x10, + TOP_POINT_LIST_BF = 0x11, + TOP_LINE_STRIP_CONT = 0x12, + TOP_LINE_STRIP_BF = 0x13, + TOP_LINE_STRIP_CONT_BF = 0x14, + TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, + TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? 
+ + TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. + TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches + TOP_PATCHLIST_2 = 0x21, + TOP_PATCHLIST_3 = 0x22, + TOP_PATCHLIST_4 = 0x23, + TOP_PATCHLIST_5 = 0x24, + TOP_PATCHLIST_6 = 0x25, + TOP_PATCHLIST_7 = 0x26, + TOP_PATCHLIST_8 = 0x27, + TOP_PATCHLIST_9 = 0x28, + TOP_PATCHLIST_10 = 0x29, + TOP_PATCHLIST_11 = 0x2A, + TOP_PATCHLIST_12 = 0x2B, + TOP_PATCHLIST_13 = 0x2C, + TOP_PATCHLIST_14 = 0x2D, + TOP_PATCHLIST_15 = 0x2E, + TOP_PATCHLIST_16 = 0x2F, + TOP_PATCHLIST_17 = 0x30, + TOP_PATCHLIST_18 = 0x31, + TOP_PATCHLIST_19 = 0x32, + TOP_PATCHLIST_20 = 0x33, + TOP_PATCHLIST_21 = 0x34, + TOP_PATCHLIST_22 = 0x35, + TOP_PATCHLIST_23 = 0x36, + TOP_PATCHLIST_24 = 0x37, + TOP_PATCHLIST_25 = 0x38, + TOP_PATCHLIST_26 = 0x39, + TOP_PATCHLIST_27 = 0x3A, + TOP_PATCHLIST_28 = 0x3B, + TOP_PATCHLIST_29 = 0x3C, + TOP_PATCHLIST_30 = 0x3D, + TOP_PATCHLIST_31 = 0x3E, + TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_SHADER_TYPE +////////////////////////////////////////////////////////////////////////// +enum SWR_SHADER_TYPE +{ + SHADER_VERTEX, + SHADER_GEOMETRY, + SHADER_DOMAIN, + SHADER_HULL, + SHADER_PIXEL, + SHADER_COMPUTE, + + NUM_SHADER_TYPES, +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RENDERTARGET_ATTACHMENT +/// @todo It's not clear what an "attachment" means. It's not a common term. +////////////////////////////////////////////////////////////////////////// +enum SWR_RENDERTARGET_ATTACHMENT +{ + SWR_ATTACHMENT_COLOR0, + SWR_ATTACHMENT_COLOR1, + SWR_ATTACHMENT_COLOR2, + SWR_ATTACHMENT_COLOR3, + SWR_ATTACHMENT_COLOR4, + SWR_ATTACHMENT_COLOR5, + SWR_ATTACHMENT_COLOR6, + SWR_ATTACHMENT_COLOR7, + SWR_ATTACHMENT_DEPTH, + SWR_ATTACHMENT_STENCIL, + + SWR_NUM_ATTACHMENTS +}; + +#define SWR_NUM_RENDERTARGETS 8 + +#define SWR_ATTACHMENT_COLOR0_BIT 0x001 +#define SWR_ATTACHMENT_COLOR1_BIT 0x002 +#define SWR_ATTACHMENT_COLOR2_BIT 0x004 +#define SWR_ATTACHMENT_COLOR3_BIT 0x008 +#define SWR_ATTACHMENT_COLOR4_BIT 0x010 +#define SWR_ATTACHMENT_COLOR5_BIT 0x020 +#define SWR_ATTACHMENT_COLOR6_BIT 0x040 +#define SWR_ATTACHMENT_COLOR7_BIT 0x080 +#define SWR_ATTACHMENT_DEPTH_BIT 0x100 +#define SWR_ATTACHMENT_STENCIL_BIT 0x200 +#define SWR_ATTACHMENT_MASK_ALL 0x3ff +#define SWR_ATTACHMENT_MASK_COLOR 0x0ff + + +////////////////////////////////////////////////////////////////////////// +/// @brief SWR Inner Tessellation factor ID +/// See above GetTessFactorOutputPosition code for documentation +enum SWR_INNER_TESSFACTOR_ID +{ + SWR_QUAD_U_TRI_INSIDE, + SWR_QUAD_V_INSIDE, + + SWR_NUM_INNER_TESS_FACTORS, +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief SWR Outer Tessellation factor ID +/// See above GetTessFactorOutputPosition code for documentation +enum SWR_OUTER_TESSFACTOR_ID +{ + SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, + SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, + SWR_QUAD_U_EQ1_TRI_W, + SWR_QUAD_V_EQ1, + + SWR_NUM_OUTER_TESS_FACTORS, +}; + + +///////////////////////////////////////////////////////////////////////// +/// simdvertex +/// @brief Defines a vertex element that holds all the data for SIMD vertices.
+/// Contains position in clip space, hardcoded to attribute 0, +/// space for up to 32 attributes, as well as any SGV values generated +/// by the pipeline +///////////////////////////////////////////////////////////////////////// +#define VERTEX_POSITION_SLOT 0 +#define VERTEX_ATTRIB_START_SLOT 1 +#define VERTEX_ATTRIB_END_SLOT 32 +#define VERTEX_RTAI_SLOT 33 // GS writes RenderTargetArrayIndex here +#define VERTEX_PRIMID_SLOT 34 // GS writes PrimId here +#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS writes lower 4 clip/cull dist +#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS writes upper 4 clip/cull dist +#define VERTEX_POINT_SIZE_SLOT 37 // VS writes point size here +static_assert(VERTEX_POINT_SIZE_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); + +// SoAoSoA +struct simdvertex +{ + simdvector attrib[KNOB_NUM_ATTRIBUTES]; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_VS_CONTEXT +/// @brief Input to vertex shader +///////////////////////////////////////////////////////////////////////// +struct SWR_VS_CONTEXT +{ + simdvertex* pVin; // IN: SIMD input vertex data store + simdvertex* pVout; // OUT: SIMD output vertex data store + + uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD + simdscalari VertexID; // IN: Vertex ID + simdscalari mask; // IN: Active mask for shader +}; + +///////////////////////////////////////////////////////////////////////// +/// ScalarCPoint +/// @brief defines a control point element as passed from the output +/// of the hull shader to the input of the domain shader +///////////////////////////////////////////////////////////////////////// +struct ScalarAttrib +{ + float x; + float y; + float z; + float w; +}; + +struct ScalarCPoint +{ + ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES]; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TESSELLATION_FACTORS +/// @brief Tessellation factors structure (non-vector) +///////////////////////////////////////////////////////////////////////// +struct SWR_TESSELLATION_FACTORS +{ + float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; + float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; +}; + +#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches +struct ScalarPatch +{ + SWR_TESSELLATION_FACTORS tessFactors; + ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; + ScalarCPoint patchData; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_HS_CONTEXT +/// @brief Input to hull shader +///////////////////////////////////////////////////////////////////////// +struct SWR_HS_CONTEXT +{ + simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data + simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call + simdscalari mask; // IN: Active mask for shader + ScalarPatch* pCPout; // OUT: Output control point patch + // SIMD-sized-array of SCALAR patches +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_DS_CONTEXT +/// @brief Input to domain shader +///////////////////////////////////////////////////////////////////////// +struct SWR_DS_CONTEXT +{ + uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation + uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. 
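Taking the vectorOffset comment above together with the vectorStride and pOutputData descriptions just below, one plausible addressing scheme for a domain shader writing component c of attribute a into SIMD vector v would be (a sketch under those assumptions only; the actual layout is defined by the DS jit and is not shown in this patch):

    simdscalar* pDst = pDsCtx->pOutputData
                     + (a * 4 + c) * pDsCtx->vectorStride  // one row per attribute-component
                     + pDsCtx->vectorOffset + v;           // vector index within the row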
+ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component + ScalarPatch* pCpIn; // IN: (SCALAR) Control patch + simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords + simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords + simdscalari mask; // IN: Active mask for shader + simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_GS_CONTEXT +/// @brief Input to geometry shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_GS_CONTEXT +{ + simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims + simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call + uint32_t InstanceID; // IN: input instance ID + simdscalari mask; // IN: Active mask for shader + uint8_t* pStream; // OUT: output stream (contains vertices for all output streams) + uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer + simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane +}; + +struct PixelPositions +{ + simdscalar UL; + simdscalar center; + simdscalar sample; + simdscalar centroid; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_PS_CONTEXT +/// @brief Input to pixel shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_PS_CONTEXT +{ + PixelPositions vX; // IN: x location(s) of pixels + PixelPositions vY; // IN: y location(s) of pixels + simdscalar vZ; // INOUT: z location of pixels + simdscalari activeMask; // OUT: mask for kill + simdscalar inputMask; // IN: input coverage mask for all samples + simdscalari oMask; // OUT: mask for output coverage + + PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid + PixelPositions vJ; + PixelPositions vOneOverW; // IN: 1/w + + const float* pAttribs; // IN: pointer to attribute barycentric coefficients + const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients + const float* pRecipW; // IN: pointer to 1/w coord for each vertex + const float *I; // IN: Barycentric A, B, and C coefs used to compute I + const float *J; // IN: Barycentric A, B, and C coefs used to compute J + float recipDet; // IN: 1/Det, used when barycentric interpolating attributes + const float* pSamplePosX; // IN: array of sample positions + const float* pSamplePosY; // IN: array of sample positions + simdvector shaded[SWR_NUM_RENDERTARGETS]; + // OUT: result color per rendertarget + + uint32_t frontFace; // IN: front- 1, back- 0 + uint32_t primID; // IN: primitive ID + uint32_t sampleIndex; // IN: sampleIndex +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_CS_CONTEXT +/// @brief Input to compute shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_CS_CONTEXT +{ + // The ThreadGroupId is the current thread group index relative + // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, + // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. + + // Compute shader accepts the following system values. + // o ThreadId - Current thread id relative to all other threads in dispatch. + // o ThreadGroupId - Current thread group id relative to all other groups in dispatch.
+ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. + // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. + // + // All of these system values can be computed in the shader. They will be + // derived from the current tile counter. The tile counter is an atomic counter that + // resides in the draw context and is initialized to the product of the dispatch dims. + // + // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z + // + // Each CPU worker thread will atomically decrement this counter and pass the current + // count into the shader. When the count reaches 0, all thread groups in the + // dispatch call have been completed. + + uint32_t tileCounter; // The tile counter value for this thread group. + + // Dispatch dimensions used by shader to compute system values from the tile counter. + uint32_t dispatchDims[3]; + + uint8_t* pTGSM; // Thread Group Shared Memory pointer. + + uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support +}; + +// enums +enum SWR_TILE_MODE +{ + SWR_TILE_NONE = 0x0, // Linear mode (no tiling) + SWR_TILE_MODE_WMAJOR, // W major tiling + SWR_TILE_MODE_XMAJOR, // X major tiling + SWR_TILE_MODE_YMAJOR, // Y major tiling + SWR_TILE_SWRZ, // SWR-Z tiling + + SWR_TILE_MODE_COUNT +}; + +enum SWR_SURFACE_TYPE +{ + SURFACE_1D = 0, + SURFACE_2D = 1, + SURFACE_3D = 2, + SURFACE_CUBE = 3, + SURFACE_BUFFER = 4, + SURFACE_STRUCTURED_BUFFER = 5, + SURFACE_NULL = 7 +}; + +enum SWR_ZFUNCTION +{ + ZFUNC_ALWAYS, + ZFUNC_NEVER, + ZFUNC_LT, + ZFUNC_EQ, + ZFUNC_LE, + ZFUNC_GT, + ZFUNC_NE, + ZFUNC_GE, + NUM_ZFUNC +}; + +enum SWR_STENCILOP +{ + STENCILOP_KEEP, + STENCILOP_ZERO, + STENCILOP_REPLACE, + STENCILOP_INCRSAT, + STENCILOP_DECRSAT, + STENCILOP_INCR, + STENCILOP_DECR, + STENCILOP_INVERT +}; + +enum SWR_BLEND_FACTOR +{ + BLENDFACTOR_ONE, + BLENDFACTOR_SRC_COLOR, + BLENDFACTOR_SRC_ALPHA, + BLENDFACTOR_DST_ALPHA, + BLENDFACTOR_DST_COLOR, + BLENDFACTOR_SRC_ALPHA_SATURATE, + BLENDFACTOR_CONST_COLOR, + BLENDFACTOR_CONST_ALPHA, + BLENDFACTOR_SRC1_COLOR, + BLENDFACTOR_SRC1_ALPHA, + BLENDFACTOR_ZERO, + BLENDFACTOR_INV_SRC_COLOR, + BLENDFACTOR_INV_SRC_ALPHA, + BLENDFACTOR_INV_DST_ALPHA, + BLENDFACTOR_INV_DST_COLOR, + BLENDFACTOR_INV_CONST_COLOR, + BLENDFACTOR_INV_CONST_ALPHA, + BLENDFACTOR_INV_SRC1_COLOR, + BLENDFACTOR_INV_SRC1_ALPHA +}; + +enum SWR_BLEND_OP +{ + BLENDOP_ADD, + BLENDOP_SUBTRACT, + BLENDOP_REVSUBTRACT, + BLENDOP_MIN, + BLENDOP_MAX, +}; + +enum SWR_LOGIC_OP +{ + LOGICOP_CLEAR, + LOGICOP_NOR, + LOGICOP_AND_INVERTED, + LOGICOP_COPY_INVERTED, + LOGICOP_AND_REVERSE, + LOGICOP_INVERT, + LOGICOP_XOR, + LOGICOP_NAND, + LOGICOP_AND, + LOGICOP_EQUIV, + LOGICOP_NOOP, + LOGICOP_OR_INVERTED, + LOGICOP_COPY, + LOGICOP_OR_REVERSE, + LOGICOP_OR, + LOGICOP_SET, +}; + +struct SWR_SURFACE_STATE +{ + uint8_t *pBaseAddress; + SWR_SURFACE_TYPE type; // @llvm_enum + SWR_FORMAT format; // @llvm_enum + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t numSamples; + uint32_t samplePattern; + uint32_t pitch; + uint32_t qpitch; + uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler + uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed + float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler + uint32_t lod; // for render targets, the lod being rendered to + uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces +
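Tying the SWR_CS_CONTEXT comment above to concrete arithmetic: since the tile counter counts down from the product of the dispatch dimensions, a shader could recover the flattened and 3D group ids roughly as follows (a sketch only; it assumes tileCounter holds the pre-decrement value for this group):

    const uint32_t* dims = csCtx.dispatchDims;
    uint32_t total = dims[0] * dims[1] * dims[2];
    uint32_t flat  = total - csCtx.tileCounter;        // ThreadGroupIdFlattened, 0 .. total-1
    uint32_t gz    = flat / (dims[0] * dims[1]);
    uint32_t gy    = (flat % (dims[0] * dims[1])) / dims[0];
    uint32_t gx    = flat % dims[0];                   // ThreadGroupId = (gx, gy, gz)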
SWR_TILE_MODE tileMode; // @llvm_enum + bool bInterleavedSamples; // are MSAA samples stored interleaved or planar + uint32_t halign; + uint32_t valign; + uint32_t xOffset; + uint32_t yOffset; + + uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces + + uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. +}; + +// vertex fetch state +// WARNING- any changes to this struct need to be reflected +// in the fetch shader jit +struct SWR_VERTEX_BUFFER_STATE +{ + uint32_t index; + uint32_t pitch; + const uint8_t *pData; + uint32_t size; + uint32_t numaNode; + uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks + uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for partially OOB vertices +}; + +struct SWR_INDEX_BUFFER_STATE +{ + // Format type for indices (e.g. UINT16, UINT32, etc.) + SWR_FORMAT format; // @llvm_enum + const void *pIndices; + uint32_t size; +}; + + +////////////////////////////////////////////////////////////////////////// +/// SWR_FETCH_CONTEXT +/// @brief Input to fetch shader. +/// @note WARNING - Changes to this struct need to be reflected in the +/// fetch shader jit. +///////////////////////////////////////////////////////////////////////// +struct SWR_FETCH_CONTEXT +{ + const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers + const int32_t* pIndices; // IN: pointer to index buffer for indexed draws + const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking + uint32_t CurInstance; // IN: current instance + uint32_t BaseVertex; // IN: base vertex + uint32_t StartVertex; // IN: start vertex + uint32_t StartInstance; // IN: start instance + simdscalari VertexID; // OUT: vector of vertex IDs + simdscalari CutMask; // OUT: vector mask of indices which have the cut index value +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_STATS +/// +/// @brief All statistics generated by SWR go here. These are public +/// to driver. +///////////////////////////////////////////////////////////////////////// +struct SWR_STATS +{ + // Occlusion Query + uint64_t DepthPassCount; // Number of passing depth tests. Not exact. + + // Pipeline Stats + uint64_t IaVertices; // Number of Fetch Shader vertices + uint64_t IaPrimitives; // Number of PA primitives. + uint64_t VsInvocations; // Number of Vertex Shader invocations + uint64_t HsInvocations; // Number of Hull Shader invocations + uint64_t DsInvocations; // Number of Domain Shader invocations + uint64_t GsInvocations; // Number of Geometry Shader invocations + uint64_t PsInvocations; // Number of Pixel Shader invocations + uint64_t CsInvocations; // Number of Compute Shader invocations + uint64_t CInvocations; // Number of clipper invocations + uint64_t CPrimitives; // Number of clipper primitives. + uint64_t GsPrimitives; // Number of prims GS outputs. + + // Streamout Stats + uint32_t SoWriteOffset[4]; + uint64_t SoPrimStorageNeeded[4]; + uint64_t SoNumPrimsWritten[4]; +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_BUFFERS +///////////////////////////////////////////////////////////////////////// + +#define MAX_SO_STREAMS 4 +#define MAX_ATTRIBUTES 32 + +struct SWR_STREAMOUT_BUFFER +{ + bool enable; + + // Pointers to streamout buffers. + uint32_t* pBuffer; + + // Size of buffer in dwords. + uint32_t bufferSize; + + // Vertex pitch of buffer in dwords. 
+ uint32_t pitch; + + // Offset into buffer in dwords. SOS will increment this offset. + uint32_t streamOffset; + + // Offset to the SO write offset. If not null then we update offset here. + uint32_t* pWriteOffset; + +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_STATE +///////////////////////////////////////////////////////////////////////// +struct SWR_STREAMOUT_STATE +{ + // Enables/disables stream output. + bool soEnable; + + // which streams are enabled for streamout + bool streamEnable[MAX_SO_STREAMS]; + + // If set then do not send any streams to the rasterizer. + bool rasterizerDisable; + + // Specifies which stream to send to the rasterizer. + uint32_t streamToRasterizer; + + // The stream masks specify which attributes are sent to which streams. + // These masks help the FE set up the pPrimData buffer that is passed + // to the Stream Output Shader (SOS) function. + uint32_t streamMasks[MAX_SO_STREAMS]; + + // Number of attributes, including position, per vertex that are streamed out. + // This should match number of bits in stream mask. + uint32_t streamNumEntries[MAX_SO_STREAMS]; +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_CONTEXT - Passed to SOS +///////////////////////////////////////////////////////////////////////// +struct SWR_STREAMOUT_CONTEXT +{ + uint32_t* pPrimData; + SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; + + // Num prims written for this stream + uint32_t numPrimsWritten; + + // Num prims that should have been written if there were no overflow. + uint32_t numPrimStorageNeeded; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_GS_STATE - Geometry shader state +///////////////////////////////////////////////////////////////////////// +struct SWR_GS_STATE +{ + bool gsEnable; + + // number of input attributes per vertex. used by the frontend to + // optimize assembling primitives for GS + uint32_t numInputAttribs; + + // output topology - can be point, tristrip, or linestrip + PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum + + // maximum number of verts that can be emitted by a single instance of the GS + uint32_t maxNumVerts; + + // instance count + uint32_t instanceCount; + + // geometry shader emits renderTargetArrayIndex + bool emitsRenderTargetArrayIndex; + + // geometry shader emits PrimitiveID + bool emitsPrimitiveID; + + // if true, geometry shader emits a single stream, with separate cut buffer. + // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer + // to map vertices to streams + bool isSingleStream; + + // when single stream is enabled, singleStreamID dictates which stream is being output.
+ // field ignored if isSingleStream is false + uint32_t singleStreamID; +}; + + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_OUTPUT_TOPOLOGY +{ + SWR_TS_OUTPUT_POINT, + SWR_TS_OUTPUT_LINE, + SWR_TS_OUTPUT_TRI_CW, + SWR_TS_OUTPUT_TRI_CCW, + + SWR_TS_OUTPUT_TOPOLOGY_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_PARTITIONING - Defines tessellation algorithm +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_PARTITIONING +{ + SWR_TS_INTEGER, + SWR_TS_ODD_FRACTIONAL, + SWR_TS_EVEN_FRACTIONAL, + + SWR_TS_PARTITIONING_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_DOMAIN - Defines Tessellation Domain +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_DOMAIN +{ + SWR_TS_QUAD, + SWR_TS_TRI, + SWR_TS_ISOLINE, + + SWR_TS_DOMAIN_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_STATE - Tessellation state +///////////////////////////////////////////////////////////////////////// +struct SWR_TS_STATE +{ + bool tsEnable; + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum + SWR_TS_PARTITIONING partitioning; // @llvm_enum + SWR_TS_DOMAIN domain; // @llvm_enum + + PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum + + uint32_t numHsInputAttribs; + uint32_t numHsOutputAttribs; + uint32_t numDsOutputAttribs; +}; + +// output merger state +struct SWR_RENDER_TARGET_BLEND_STATE +{ + uint8_t writeDisableRed : 1; + uint8_t writeDisableGreen : 1; + uint8_t writeDisableBlue : 1; + uint8_t writeDisableAlpha : 1; +}; +static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); + +#define SWR_MAX_NUM_MULTISAMPLES 16 +enum SWR_MULTISAMPLE_COUNT +{ + SWR_MULTISAMPLE_1X = 0, + SWR_MULTISAMPLE_2X, + SWR_MULTISAMPLE_4X, + SWR_MULTISAMPLE_8X, + SWR_MULTISAMPLE_16X, + SWR_MULTISAMPLE_TYPE_MAX +}; + +struct SWR_BLEND_STATE +{ + // constant blend factor color in RGBA float + float constantColor[4]; + + // alpha test reference value in unorm8 or float32 + uint32_t alphaTestReference; + uint32_t sampleMask; + // all RT's have the same sample count + ///@todo move this to Output Merger state when we refactor + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + + SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; +}; +static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size"); + +////////////////////////////////////////////////////////////////////////// +/// FUNCTION POINTERS FOR SHADERS + +typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); +typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); +typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); +typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); +typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext); +typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); +typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); +typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, 
simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); + +////////////////////////////////////////////////////////////////////////// +/// FRONTEND_STATE +///////////////////////////////////////////////////////////////////////// +struct SWR_FRONTEND_STATE +{ + // skip clip test, perspective divide, and viewport transform + // intended for verts in screen space + bool vpTransformDisable; + union + { + struct + { + uint32_t triFan : 2; + uint32_t lineStripList : 1; + uint32_t triStripList : 2; + }; + uint32_t bits; + }provokingVertex; + uint32_t topologyProvokingVertex; // provoking vertex for the draw topology +}; + +////////////////////////////////////////////////////////////////////////// +/// VIEWPORT_MATRIX +///////////////////////////////////////////////////////////////////////// +struct SWR_VIEWPORT_MATRIX +{ + float m00; + float m11; + float m22; + float m30; + float m31; + float m32; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_VIEWPORT +///////////////////////////////////////////////////////////////////////// +struct SWR_VIEWPORT +{ + float x; + float y; + float width; + float height; + float minZ; + float maxZ; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_CULLMODE +////////////////////////////////////////////////////////////////////////// +enum SWR_CULLMODE +{ + SWR_CULLMODE_BOTH, + SWR_CULLMODE_NONE, + SWR_CULLMODE_FRONT, + SWR_CULLMODE_BACK +}; + +enum SWR_FILLMODE +{ + SWR_FILLMODE_POINT, + SWR_FILLMODE_WIREFRAME, + SWR_FILLMODE_SOLID +}; + +enum SWR_FRONTWINDING +{ + SWR_FRONTWINDING_CW, + SWR_FRONTWINDING_CCW +}; + + +enum SWR_MSAA_SAMPLE_PATTERN +{ + SWR_MSAA_CENTER_PATTERN, + SWR_MSAA_STANDARD_PATTERN, + SWR_MSAA_SAMPLE_PATTERN_MAX +}; + +enum SWR_PIXEL_LOCATION +{ + SWR_PIXEL_LOCATION_CENTER, + SWR_PIXEL_LOCATION_UL, +}; + +// fixed point screen space sample locations within a pixel +struct SWR_MULTISAMPLE_POS +{ + uint32_t x; + uint32_t y; +}; + +enum SWR_MSAA_RASTMODE +{ + SWR_MSAA_RASTMODE_OFF_PIXEL, + SWR_MSAA_RASTMODE_OFF_PATTERN, + SWR_MSAA_RASTMODE_ON_PIXEL, + SWR_MSAA_RASTMODE_ON_PATTERN +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RASTSTATE +////////////////////////////////////////////////////////////////////////// +struct SWR_RASTSTATE +{ + uint32_t cullMode : 2; + uint32_t fillMode : 2; + uint32_t frontWinding : 1; + uint32_t scissorEnable : 1; + uint32_t depthClipEnable : 1; + float pointSize; + float lineWidth; + + // point size output from the VS + bool pointParam; + + // point sprite + bool pointSpriteEnable; + bool pointSpriteTopOrigin; + + // depth bias + float depthBias; + float slopeScaledDepthBias; + float depthBiasClamp; + SWR_FORMAT depthFormat; // @llvm_enum + + ///@todo: MSAA lines + // multisample state for MSAA lines + bool msaaRastEnable; + SWR_MSAA_RASTMODE rastMode; // @llvm_enum + + // sample count the rasterizer is running at + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + bool bForcedSampleCount; + uint32_t pixelLocation; // UL or Center + bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction + SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; + SWR_MSAA_SAMPLE_PATTERN samplePattern; // @llvm_enum + + // user clip/cull distance enables + uint8_t cullDistanceMask; + uint8_t clipDistanceMask; +}; + +// backend state +struct SWR_BACKEND_STATE +{ + uint32_t constantInterpolationMask; + uint32_t pointSpriteTexCoordMask; + 
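The depth-bias fields in SWR_RASTSTATE above follow the usual D3D-style convention, where the applied bias combines a constant term and a slope-scaled term. A conventional formulation, sketched here for reference (the patch's actual computation is not shown; maxDepthSlope, the triangle's max(|dz/dx|, |dz/dy|), and the format epsilon r are assumed inputs):

    // r = smallest representable depth delta for depthFormat (e.g. 1.0f / (1 << 24) for D24)
    float bias = rastState.depthBias * r
               + rastState.slopeScaledDepthBias * maxDepthSlope;
    if (rastState.depthBiasClamp != 0.0f)
        bias = std::min(bias, rastState.depthBiasClamp);  // clamp shown for positive bias only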
uint8_t numAttributes; + uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; +}; + +union SWR_DEPTH_STENCIL_STATE +{ + struct + { + // dword 0 + uint32_t depthWriteEnable : 1; + uint32_t depthTestEnable : 1; + uint32_t stencilWriteEnable : 1; + uint32_t stencilTestEnable : 1; + uint32_t doubleSidedStencilTestEnable : 1; + + uint32_t depthTestFunc : 3; + uint32_t stencilTestFunc : 3; + + uint32_t backfaceStencilPassDepthPassOp : 3; + uint32_t backfaceStencilPassDepthFailOp : 3; + uint32_t backfaceStencilFailOp : 3; + uint32_t backfaceStencilTestFunc : 3; + uint32_t stencilPassDepthPassOp : 3; + uint32_t stencilPassDepthFailOp : 3; + uint32_t stencilFailOp : 3; + + // dword 1 + uint8_t backfaceStencilWriteMask; + uint8_t backfaceStencilTestMask; + uint8_t stencilWriteMask; + uint8_t stencilTestMask; + + // dword 2 + uint8_t backfaceStencilRefValue; + uint8_t stencilRefValue; + }; + uint32_t value[3]; +}; + +enum SWR_SHADING_RATE +{ + SWR_SHADING_RATE_PIXEL, + SWR_SHADING_RATE_SAMPLE, + SWR_SHADING_RATE_COARSE, + SWR_SHADING_RATE_MAX, +}; + +enum SWR_INPUT_COVERAGE +{ + SWR_INPUT_COVERAGE_NONE, + SWR_INPUT_COVERAGE_NORMAL, + SWR_INPUT_COVERAGE_MAX, +}; + +enum SWR_PS_POSITION_OFFSET +{ + SWR_PS_POSITION_SAMPLE_NONE, + SWR_PS_POSITION_SAMPLE_OFFSET, + SWR_PS_POSITION_CENTROID_OFFSET, + SWR_PS_POSITION_OFFSET_MAX, +}; + +enum SWR_BARYCENTRICS_MASK +{ + SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1, + SWR_BARYCENTRIC_CENTROID_MASK = 0x2, + SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4, + SWR_BARYCENTRICS_MASK_MAX = 0x8 +}; + +// pixel shader state +struct SWR_PS_STATE +{ + // dword 0-1 + PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn + + // dword 2 + uint32_t killsPixel : 1; // pixel shader can kill pixels + uint32_t inputCoverage : 1; // type of input coverage PS uses + uint32_t writesODepth : 1; // pixel shader writes to depth + uint32_t usesSourceDepth : 1; // pixel shader reads depth + uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel + uint32_t numRenderTargets : 4; // number of render target outputs in use (0-8) + uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position + uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with + uint32_t usesUAV : 1; // pixel shader accesses UAV + uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h new file mode 100644 index 00000000000..915ac77897b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h @@ -0,0 +1,88 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file tessellator.h +* +* @brief Tessellator fixed function unit interface definition +* +******************************************************************************/ +#pragma once + +/// Allocate and initialize a new tessellation context +HANDLE SWR_API TSInitCtx( + SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) + SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology + void* pContextMem, ///< [IN] Memory to use for the context + size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required + +/// Destroy & de-allocate tessellation context +void SWR_API TSDestroyCtx( + HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed + +struct SWR_TS_TESSELLATED_DATA +{ + uint32_t NumPrimitives; + uint32_t NumDomainPoints; + + uint32_t* ppIndices[3]; + float* pDomainPointsU; + float* pDomainPointsV; + // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] +}; + +/// Perform Tessellation +void SWR_API TSTessellate( + HANDLE tsCtx, ///< [IN] Tessellation Context + const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors + SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data + + + +/// @TODO - Implement OSS tessellator + +INLINE HANDLE SWR_API TSInitCtx( + SWR_TS_DOMAIN tsDomain, + SWR_TS_PARTITIONING tsPartitioning, + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, + void* pContextMem, + size_t& memSize) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); + return NULL; +} + + +INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); +} + + +INLINE void SWR_API TSTessellate( + HANDLE tsCtx, + const SWR_TESSELLATION_FACTORS& tsTessFactors, + SWR_TS_TESSELLATED_DATA& tsTessellatedData) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp new file mode 100644 index 00000000000..24c5588bfec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -0,0 +1,962 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include <stdio.h> +#include <thread> +#include <algorithm> +#include <unordered_set> +#include <float.h> +#include <vector> +#include <utility> +#include <fstream> +#include <string> + +#if defined(__linux__) || defined(__gnu_linux__) +#include <pthread.h> +#include <sched.h> +#include <unistd.h> +#endif + +#include "common/os.h" +#include "context.h" +#include "frontend.h" +#include "backend.h" +#include "rasterizer.h" +#include "rdtsc_core.h" +#include "tilemgr.h" +#include "core/multisample.h" + + + + +// ThreadId +struct Core +{ + uint32_t procGroup = 0; + std::vector<uint32_t> threadIds; +}; + +struct NumaNode +{ + std::vector<Core> cores; +}; + +typedef std::vector<NumaNode> CPUNumaNodes; + +void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup) +{ + out_nodes.clear(); + out_numThreadsPerProcGroup = 0; + +#if defined(_WIN32) + + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; + DWORD bufSize = sizeof(buffer); + + BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); + SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); + + uint32_t count = bufSize / buffer->Size; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; + + for (uint32_t i = 0; i < count; ++i) + { + SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); + for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) + { + auto& gmask = pBuffer->Processor.GroupMask[g]; + uint32_t threadId = 0; + uint32_t procGroup = gmask.Group; + + Core* pCore = nullptr; + + uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); + + while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) + { + // clear mask + gmask.Mask &= ~(KAFFINITY(1) << threadId); + + // Find Numa Node + PROCESSOR_NUMBER procNum = {}; + procNum.Group = WORD(procGroup); + procNum.Number = UCHAR(threadId); + + uint32_t numaId = 0; + ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); + SWR_ASSERT(ret); + + // Store data + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + + uint32_t coreId = 0; + + if (nullptr == pCore) + { + numaNode.cores.push_back(Core()); + pCore = &numaNode.cores.back(); + pCore->procGroup = procGroup; +#if !defined(_WIN64) + coreId = (uint32_t)numaNode.cores.size(); + if ((coreId * numThreads) >= 32) + { + // Windows doesn't return threadIds >= 32 for a processor group correctly + // when running a 32-bit application. 
+ // Just save -1 as the threadId + threadId = uint32_t(-1); + } +#endif + } + pCore->threadIds.push_back(threadId); + if (procGroup == 0) + { + out_numThreadsPerProcGroup++; + } + } + } + pBuffer = PtrAdd(pBuffer, pBuffer->Size); + } + + +#elif defined(__linux__) || defined (__gnu_linux__) + + // Parse /proc/cpuinfo to get full topology + std::ifstream input("/proc/cpuinfo"); + std::string line; + char* c; + uint32_t threadId = uint32_t(-1); + uint32_t coreId = uint32_t(-1); + uint32_t numaId = uint32_t(-1); + + while (std::getline(input, line)) + { + if (line.find("processor") != std::string::npos) + { + if (threadId != uint32_t(-1)) + { + // Save information. + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); + auto& core = numaNode.cores[coreId]; + + core.procGroup = coreId; + core.threadIds.push_back(threadId); + + out_numThreadsPerProcGroup++; + } + + auto data_start = line.find(": ") + 2; + threadId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + if (line.find("core id") != std::string::npos) + { + auto data_start = line.find(": ") + 2; + coreId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + if (line.find("physical id") != std::string::npos) + { + auto data_start = line.find(": ") + 2; + numaId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + } + + if (threadId != uint32_t(-1)) + { + // Save information. + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); + auto& core = numaNode.cores[coreId]; + + core.procGroup = coreId; + core.threadIds.push_back(threadId); + out_numThreadsPerProcGroup++; + } + + for (uint32_t node = 0; node < out_nodes.size(); node++) { + auto& numaNode = out_nodes[node]; + auto it = numaNode.cores.begin(); + for ( ; it != numaNode.cores.end(); ) { + if (it->threadIds.size() == 0) + numaNode.cores.erase(it); + else + ++it; + } + } + +#else + +#error Unsupported platform + +#endif +} + + +void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false) +{ + // Only bind threads when MAX_WORKER_THREADS isn't set. + if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false) + { + return; + } + +#if defined(_WIN32) + { + GROUP_AFFINITY affinity = {}; + affinity.Group = procGroupId; + +#if !defined(_WIN64) + if (threadId >= 32) + { + // In a 32-bit process on Windows it is impossible to bind + // to logical processors 32-63 within a processor group. + // In this case set the mask to 0 and let the system assign + // the processor. Hopefully it will make smart choices. + affinity.Mask = 0; + } + else +#endif + { + // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group, + // Not the individual HW thread. 
+ if (!KNOB_MAX_WORKER_THREADS) + { + affinity.Mask = KAFFINITY(1) << threadId; + } + } + + SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); + } +#else + cpu_set_t cpuset; + pthread_t thread = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(threadId, &cpuset); + + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); +#endif +} + +INLINE +uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) +{ + //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); + //return result; + return pContext->DrawEnqueued; +} + +INLINE +DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) +{ + return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; +} + +// returns true if dependency not met +INLINE +bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) +{ + return (pDC->dependency > lastRetiredDraw); +} + +void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + { + _simd_store_ps(pfBuf, valR); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valG); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valB); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valA); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + { + _simd_store_ps(pfBuf, valZ); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void ClearStencilHotTile(const HOTTILE* pHotTile) +{ + // convert from F32 to U8. + uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); + //broadcast 32x into __m256i... + simdscalari valS = _simd_set1_epi8(clearVal); + + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 
+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + { + _simd_store_si(pBuf, valS); + pBuf += 1; + } + } + } +} + +// For draw calls, we initialize the active hot tiles and perform a deferred +// load on them if the tile is in an invalid state. We do this in the outer thread loop instead of inside +// the draw routine itself, mainly for performance, to avoid unnecessary setup +// for every triangle +// @todo support deferred clear +INLINE +void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) +{ + const API_STATE& state = GetApiState(pDC); + HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + x *= KNOB_MACROTILE_X_DIM; + y *= KNOB_MACROTILE_Y_DIM; + + uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); + + // check RT if enabled + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while(_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearColorHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + colorHottileEnableMask &= ~(1 << rtSlot); + } + + // check depth if enabled + if (state.depthHottileEnable) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearDepthHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } + + // check stencil if enabled + if (state.stencilHottileEnable) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile.
+            ClearStencilHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+}
+
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+{
+    // increment our current draw id to the first incomplete draw
+    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    while (curDrawBE < drawEnqueued)
+    {
+        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+        // If it's not compute and the FE is not done, then break out of the loop.
+        if (!pDC->doneFE && !pDC->isCompute) break;
+
+        bool isWorkComplete = (pDC->isCompute) ?
+            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+
+        if (isWorkComplete)
+        {
+            curDrawBE++;
+            InterlockedIncrement(&pDC->threadsDoneBE);
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    // If there are no more incomplete draws then return false.
+    return curDrawBE < drawEnqueued;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any BE work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+///                    has its own curDrawBE counter and this ensures that each worker processes all the
+///                    draws in order.
+/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
+///                      own set, and each time it fails to lock a macrotile because it's already locked,
+///                      it adds that tile to the lockedTiles set. As a worker begins to work
+///                      on future draws, the lockedTiles set ensures that it doesn't work on tiles that may
+///                      still have work pending in a previous draw. Additionally, lockedTiles is a
+///                      heuristic that can steer a worker back to the same macrotile that it had been
+///                      working on in a previous draw.
+void WorkOnFifoBE(
+    SWR_CONTEXT *pContext,
+    uint32_t workerId,
+    uint64_t &curDrawBE,
+    std::unordered_set<uint32_t>& lockedTiles)
+{
+    // Find the first incomplete draw that has pending work. If no such draw is found then
+    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
+    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    {
+        return;
+    }
+
+    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
+    lockedTiles.clear();
+
+    // Try to work on each draw in order of the available draws in flight.
+    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
+    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
+    //      working on those macrotiles that are known to be complete in the prior draw to
+    //      maintain order. The locked tiles provide the history that ensures this.
+    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+    {
+        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+        if (pDC->isCompute) return; // We don't look at compute work.
+
+        // First wait for the FE to be finished with this draw. This keeps the threading model
+        // simple, but if there are lots of bubbles between draws then serializing FE and BE may
+        // need to be revisited.
+        if (!pDC->doneFE) return;
+
+        // If this draw is dependent on a previous draw then we need to bail.
+        if (CheckDependency(pContext, pDC, lastRetiredDraw))
+        {
+            return;
+        }
+
+        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
+        std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();
+
+        for (uint32_t tileID : macroTiles)
+        {
+            MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+
+            // can only work on this draw if it's not in use by other threads
+            if (lockedTiles.find(tileID) == lockedTiles.end())
+            {
+                if (tile.getNumQueued())
+                {
+                    if (tile.tryLock())
+                    {
+                        BE_WORK *pWork;
+
+                        RDTSC_START(WorkerFoundWork);
+
+                        uint32_t numWorkItems = tile.getNumQueued();
+
+                        if (numWorkItems != 0)
+                        {
+                            pWork = tile.peek();
+                            SWR_ASSERT(pWork);
+                            if (pWork->type == DRAW)
+                            {
+                                InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
+                            }
+                        }
+
+                        while ((pWork = tile.peek()) != nullptr)
+                        {
+                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+                            tile.dequeue();
+                        }
+                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+                        _ReadWriteBarrier();
+
+                        pDC->pTileMgr->markTileComplete(tileID);
+
+                        // Optimization: If the draw is complete and we're the last one to have
+                        // worked on it, then we can reset the locked list, since all draws
+                        // before the next one are guaranteed to be complete.
+                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                        {
+                            // We can increment the current BE and safely move to the next draw since we know this draw is complete.
+                            curDrawBE++;
+                            InterlockedIncrement(&pDC->threadsDoneBE);
+
+                            lastRetiredDraw++;
+
+                            lockedTiles.clear();
+                            break;
+                        }
+                    }
+                    else
+                    {
+                        // This tile is already locked, so add it to our locked tiles set. This way we don't try locking it again.
+                        lockedTiles.insert(tileID);
+                    }
+                }
+            }
+        }
+    }
+}
+
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+{
+    // Try to grab the next DC from the ring
+    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    while (curDrawFE < drawEnqueued)
+    {
+        uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+        if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
+        {
+            curDrawFE++;
+            InterlockedIncrement(&pDC->threadsDoneFE);
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    uint64_t curDraw = curDrawFE;
+    while (curDraw < drawEnqueued)
+    {
+        uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+
+        if (!pDC->isCompute && !pDC->FeLock)
+        {
+            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
+            if (initial == 0)
+            {
+                // successfully grabbed the DC, now run the FE
+                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
+
+                _ReadWriteBarrier();
+                pDC->doneFE = true;
+            }
+        }
+        curDraw++;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any compute work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+///                    has its own curDrawBE counter and this ensures that each worker processes all the
+///                    draws in order.
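+///                  Compute work is claimed through the draw's DispatchQueue:
+///                  workers atomically decrement the available-task count to
+///                  claim a thread group, and whichever worker drops the
+///                  outstanding count to zero marks the dispatch complete
+///                  (see DispatchQueue in tilemgr.h).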
+void WorkOnCompute(
+    SWR_CONTEXT *pContext,
+    uint32_t workerId,
+    uint64_t& curDrawBE)
+{
+    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    {
+        return;
+    }
+
+    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+    if (pDC->isCompute == false) return;
+
+    // check dependencies
+    if (CheckDependency(pContext, pDC, lastRetiredDraw))
+    {
+        return;
+    }
+
+    SWR_ASSERT(pDC->pDispatch != nullptr);
+    DispatchQueue& queue = *pDC->pDispatch;
+
+    // Is there any work remaining?
+    if (queue.getNumQueued() > 0)
+    {
+        bool lastToComplete = false;
+
+        uint32_t threadGroupId = 0;
+        while (queue.getWork(threadGroupId))
+        {
+            ProcessComputeBE(pDC, workerId, threadGroupId);
+
+            lastToComplete = queue.finishedWork();
+        }
+
+        _ReadWriteBarrier();
+
+        if (lastToComplete)
+        {
+            SWR_ASSERT(queue.isWorkComplete() == true);
+            pDC->doneCompute = true;
+        }
+    }
+}
+
+DWORD workerThreadMain(LPVOID pData)
+{
+    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
+    SWR_CONTEXT *pContext = pThreadData->pContext;
+    uint32_t threadId = pThreadData->threadId;
+    uint32_t workerId = pThreadData->workerId;
+
+    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
+
+    RDTSC_INIT(threadId);
+
+    int numaNode = (int)pThreadData->numaId;
+
+    // flush denormals to 0
+    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+
+    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
+    // locked, then we'll add it to this list so that we don't try to lock it again.
+    std::unordered_set<uint32_t> lockedTiles;
+
+    // Each worker has the ability to work on any of the queued draws as long as certain
+    // conditions are met. The data associated
+    // with a draw is guaranteed to be active as long as a worker hasn't signaled that it
+    // has moved on to the next draw when it determines there is no more work to do. The API
+    // thread will not increment the head of the dc ring until all workers have moved past the
+    // current head.
+    // The logic to determine what to work on is:
+    //    1- try to work on the FE of any draw that is queued. For now there are no dependencies
+    //       on the FE work, so any worker can grab any FE and process it in parallel. Eventually
+    //       we'll need dependency tracking to force serialization on FEs. The worker will try
+    //       to pick an FE by atomically incrementing a counter in the swr context. It'll keep
+    //       trying until it reaches the tail.
+    //    2- BE work must be done in strict order. We accomplish this today by pulling work off
+    //       the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
+    //       any work left by comparing the total # of binned work items and the total # of completed
+    //       work items. If they are equal, then there is no more work to do for this draw, and
+    //       the worker can safely increment its oldestDraw counter and move on to the next draw.
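+    // For example, with draws 1..3 in flight, two workers may run the FEs of
+    // draws 2 and 3 concurrently, but BE work for a macrotile in draw 2 can
+    // only start once that macrotile is complete in draw 1; the curDrawFE and
+    // curDrawBE counters below track each worker's progress through that
+    // ordering.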
+ std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); + + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + + uint64_t curDrawBE = 1; + uint64_t curDrawFE = 1; + + while (pContext->threadPool.inThreadShutdown == false) + { + uint32_t loop = 0; + while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE)) + { + _mm_pause(); + } + + if (!threadHasWork(curDrawBE)) + { + lock.lock(); + + // check for thread idle condition again under lock + if (threadHasWork(curDrawBE)) + { + lock.unlock(); + continue; + } + + if (pContext->threadPool.inThreadShutdown) + { + lock.unlock(); + break; + } + + RDTSC_START(WorkerWaitForThreadEvent); + + pContext->FifosNotEmpty.wait(lock); + lock.unlock(); + + RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0); + + if (pContext->threadPool.inThreadShutdown) + { + break; + } + } + + RDTSC_START(WorkerWorkOnFifoBE); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); + + WorkOnCompute(pContext, workerId, curDrawBE); + + WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode); + } + + return 0; +} + +DWORD workerThreadInit(LPVOID pData) +{ +#if defined(_WIN32) + __try +#endif // _WIN32 + { + return workerThreadMain(pData); + } + +#if defined(_WIN32) + __except(EXCEPTION_CONTINUE_SEARCH) + { + } + +#endif // _WIN32 + + return 1; +} + +void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) +{ + bindThread(0); + + CPUNumaNodes nodes; + uint32_t numThreadsPerProcGroup = 0; + CalculateProcessorTopology(nodes, numThreadsPerProcGroup); + + uint32_t numHWNodes = (uint32_t)nodes.size(); + uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); + uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); + + uint32_t numNodes = numHWNodes; + uint32_t numCoresPerNode = numHWCoresPerNode; + uint32_t numHyperThreads = numHWHyperThreads; + + if (KNOB_MAX_NUMA_NODES) + { + numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); + } + + if (KNOB_MAX_CORES_PER_NUMA_NODE) + { + numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); + } + + if (KNOB_MAX_THREADS_PER_CORE) + { + numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); + } + + // Calculate numThreads + uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; + + if (KNOB_MAX_WORKER_THREADS) + { + uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads; + numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads); + } + + if (numThreads > KNOB_MAX_NUM_THREADS) + { + printf("WARNING: system thread count %u exceeds max %u, " + "performance will be degraded\n", + numThreads, KNOB_MAX_NUM_THREADS); + } + + if (numThreads == 1) + { + // If only 1 worker thread, try to move it to an available + // HW thread. If that fails, use the API thread. + if (numCoresPerNode < numHWCoresPerNode) + { + numCoresPerNode++; + } + else if (numHyperThreads < numHWHyperThreads) + { + numHyperThreads++; + } + else if (numNodes < numHWNodes) + { + numNodes++; + } + else + { + pPool->numThreads = 0; + SET_KNOB(SINGLE_THREADED, true); + return; + } + } + else + { + // Save a HW thread for the API thread. 
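+        // For example, on a single-node 4-core / 8-thread CPU with the
+        // default knob settings, numThreads computes to 8 and the decrement
+        // below leaves 7 worker threads plus the API thread.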
+ numThreads--; + } + + pPool->numThreads = numThreads; + pContext->NumWorkerThreads = pPool->numThreads; + + pPool->inThreadShutdown = false; + pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + + if (KNOB_MAX_WORKER_THREADS) + { + bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); + uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup; + // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads + // But Windows will still require binding to specific process groups + for (uint32_t workerId = 0; workerId < numThreads; ++workerId) + { + pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; + pPool->pThreadData[workerId].threadId = 0; + pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].pContext = pContext; + pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + } + } + else + { + uint32_t workerId = 0; + for (uint32_t n = 0; n < numNodes; ++n) + { + auto& node = nodes[n]; + + uint32_t numCores = numCoresPerNode; + for (uint32_t c = 0; c < numCores; ++c) + { + auto& core = node.cores[c]; + for (uint32_t t = 0; t < numHyperThreads; ++t) + { + if (c == 0 && n == 0 && t == 0) + { + // Skip core 0, thread0 on node 0 to reserve for API thread + continue; + } + + pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].procGroupId = core.procGroup; + pPool->pThreadData[workerId].threadId = core.threadIds[t]; + pPool->pThreadData[workerId].numaId = n; + pPool->pThreadData[workerId].pContext = pContext; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + ++workerId; + } + } + } + } +} + +void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) +{ + if (!KNOB_SINGLE_THREADED) + { + // Inform threads to finish up + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pPool->inThreadShutdown = true; + _mm_mfence(); + pContext->FifosNotEmpty.notify_all(); + lock.unlock(); + + // Wait for threads to finish and destroy them + for (uint32_t t = 0; t < pPool->numThreads; ++t) + { + pPool->threads[t]->join(); + delete(pPool->threads[t]); + } + + // Clean up data used by threads + free(pPool->pThreadData); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h new file mode 100644 index 00000000000..0fa7196f5ac --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file threads.h
+*
+* @brief Definitions for SWR threading model.
+*
+******************************************************************************/
+#pragma once
+
+#include "knobs.h"
+
+#include <unordered_set>
+#include <thread>
+typedef std::thread* THREAD_PTR;
+
+struct SWR_CONTEXT;
+
+struct THREAD_DATA
+{
+    uint32_t procGroupId;       // Will always be 0 for non-Windows OS
+    uint32_t threadId;          // within the procGroup for Windows
+    uint32_t numaId;            // NUMA node id
+    uint32_t workerId;
+    SWR_CONTEXT *pContext;
+    bool forceBindProcGroup;    // Only useful when KNOB_MAX_WORKER_THREADS is set.
+};
+
+
+struct THREAD_POOL
+{
+    THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
+    uint32_t numThreads;
+    volatile bool inThreadShutdown;
+    THREAD_DATA *pThreadData;
+};
+
+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+
+// Expose FE and BE worker functions to the API thread if single threaded
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
new file mode 100644
index 00000000000..860393661e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -0,0 +1,105 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.cpp
+*
+* @brief Implementation for the Macro Tile Manager, which provides the facilities
+* for threads to work on a macro tile.
+*
+******************************************************************************/
+#include <unordered_map>
+
+#include "fifo.hpp"
+#include "tilemgr.h"
+
+#define TILE_ID(x,y) (((x) << 16) | (y))
+
+// override new/delete for alignment
+void *MacroTileMgr::operator new(size_t size)
+{
+    return _aligned_malloc(size, 64);
+}
+
+void MacroTileMgr::operator delete(void *p)
+{
+    _aligned_free(p);
+}
+
+void* DispatchQueue::operator new(size_t size)
+{
+    return _aligned_malloc(size, 64);
+}
+
+void DispatchQueue::operator delete(void *p)
+{
+    _aligned_free(p);
+}
+
+MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+{
+}
+
+void MacroTileMgr::initialize()
+{
+    mWorkItemsProduced = 0;
+    mWorkItemsConsumed = 0;
+
+    mDirtyTiles.clear();
+}
+
+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
+{
+    // Should not enqueue more than what we have backing for in the hot tile manager.
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    uint32_t id = TILE_ID(x, y);
+
+    MacroTileQueue &tile = mTiles[id];
+    tile.mWorkItemsFE++;
+
+    if (tile.mWorkItemsFE == 1)
+    {
+        tile.clear(mArena);
+        mDirtyTiles.push_back(id);
+    }
+
+    mWorkItemsProduced++;
+    tile.enqueue_try_nosync(mArena, pWork);
+}
+
+void MacroTileMgr::markTileComplete(uint32_t id)
+{
+    SWR_ASSERT(mTiles.find(id) != mTiles.end());
+    MacroTileQueue &tile = mTiles[id];
+    uint32_t numTiles = tile.mWorkItemsFE;
+    InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
+
+    _ReadWriteBarrier();
+    tile.mWorkItemsBE += numTiles;
+    SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
+
+    // Clear out the tile, but defer the fifo clear until the next DC first queues to it.
+    // This prevents worker threads from constantly locking a completed macro tile.
+    tile.mWorkItemsFE = 0;
+    tile.mWorkItemsBE = 0;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
new file mode 100644
index 00000000000..9137941bad4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -0,0 +1,390 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.h
+*
+* @brief Definitions for the Macro Tile Manager, which provides the facilities
+* for threads to work on a macro tile.
+* +******************************************************************************/ +#pragma once + +#include <set> +#include <unordered_map> +#include "common/formats.h" +#include "fifo.hpp" +#include "context.h" +#include "format_traits.h" + +////////////////////////////////////////////////////////////////////////// +/// MacroTile - work queue for a tile. +////////////////////////////////////////////////////////////////////////// +struct MacroTileQueue +{ + MacroTileQueue() { } + ~MacroTileQueue() { } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Returns number of work items queued for this tile. + uint32_t getNumQueued() + { + return mFifo.getNumQueued(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Attempt to lock the work fifo. If already locked then return false. + bool tryLock() + { + return mFifo.tryLock(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Clear fifo and unlock it. + void clear(Arena& arena) + { + mFifo.clear(arena); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Peek at work sitting at the front of the fifo. + BE_WORK* peek() + { + return mFifo.peek(); + } + + bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry) + { + return mFifo.enqueue_try_nosync(arena, entry); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Move to next work item + void dequeue() + { + mFifo.dequeue_noinc(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Destroy fifo + void destroy() + { + mFifo.destroy(); + } + + ///@todo This will all be private. + uint32_t mWorkItemsFE = 0; + uint32_t mWorkItemsBE = 0; + +private: + QUEUE<BE_WORK> mFifo; +}; + +////////////////////////////////////////////////////////////////////////// +/// MacroTileMgr - Manages macrotiles for a draw. +////////////////////////////////////////////////////////////////////////// +class MacroTileMgr +{ +public: + MacroTileMgr(Arena& arena); + ~MacroTileMgr() + { + for (auto &tile : mTiles) + { + tile.second.destroy(); + } + } + + void initialize(); + INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; } + INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } + void markTileComplete(uint32_t id); + + INLINE bool isWorkComplete() + { + return mWorkItemsProduced == mWorkItemsConsumed; + } + + void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork); + + static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) + { + y = tileID & 0xffff; + x = (tileID >> 16) & 0xffff; + } + + void *operator new(size_t size); + void operator delete (void *p); + +private: + Arena& mArena; + SWR_FORMAT mFormat; + std::unordered_map<uint32_t, MacroTileQueue> mTiles; + + // Any tile that has work queued to it is a dirty tile. + std::vector<uint32_t> mDirtyTiles; + + OSALIGNLINE(LONG) mWorkItemsProduced; + OSALIGNLINE(volatile LONG) mWorkItemsConsumed; +}; + +////////////////////////////////////////////////////////////////////////// +/// DispatchQueue - work queue for dispatch +////////////////////////////////////////////////////////////////////////// +class DispatchQueue +{ +public: + DispatchQueue() {} + + ////////////////////////////////////////////////////////////////////////// + /// @brief Setup the producer consumer counts. 
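+    /// For example, a dispatch of 4 thread groups starts with
+    /// available = outstanding = 4; getWork() then hands out groupIds
+    /// 3, 2, 1, 0 as workers decrement the available count, and
+    /// finishedWork() returns true for whichever worker drops the
+    /// outstanding count to 0.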
+    void initialize(uint32_t totalTasks, void* pTaskData)
+    {
+        // The available and outstanding counts start with total tasks.
+        // At the start there are N tasks available and outstanding.
+        // When both the available and outstanding counts have reached 0 then all work has completed.
+        // When a worker starts on a threadgroup then it decrements the available count.
+        // When a worker completes a threadgroup then it decrements the outstanding count.
+
+        mTasksAvailable = totalTasks;
+        mTasksOutstanding = totalTasks;
+
+        mpTaskData = pTaskData;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Returns the number of tasks available for this dispatch.
+    uint32_t getNumQueued()
+    {
+        return (mTasksAvailable > 0) ? mTasksAvailable : 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the work available count. If the result
+    ///        is greater than or equal to 0 then we can work on the associated
+    ///        thread group. Otherwise, there is no more work to do.
+    bool getWork(uint32_t& groupId)
+    {
+        LONG result = InterlockedDecrement(&mTasksAvailable);
+
+        if (result >= 0)
+        {
+            groupId = result;
+            return true;
+        }
+
+        return false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the outstanding count. A worker calls this
+    ///        to signal that it just finished some work. Also, return true if
+    ///        we're the last worker to complete this dispatch.
+    bool finishedWork()
+    {
+        LONG result = InterlockedDecrement(&mTasksOutstanding);
+        SWR_ASSERT(result >= 0, "Should never oversubscribe work");
+
+        return (result == 0) ? true : false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Work is complete once both the available/outstanding counts have reached 0.
+    bool isWorkComplete()
+    {
+        return ((mTasksAvailable <= 0) &&
+                (mTasksOutstanding <= 0));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Return pointer to task data.
+    const void* GetTasksData()
+    {
+        return mpTaskData;
+    }
+
+    void *operator new(size_t size);
+    void operator delete (void *p);
+
+    void* mpTaskData;       // The API thread will set this up and the callback task function will interpret this.
+
+    OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
+    OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
+};
+
+
+enum HOTTILE_STATE
+{
+    HOTTILE_INVALID,        // tile is in an uninitialized state and should be loaded with surface contents before rendering
+    HOTTILE_CLEAR,          // tile should be cleared
+    HOTTILE_DIRTY,          // tile has been rendered to
+    HOTTILE_RESOLVED,       // tile has been stored to memory
+};
+
+struct HOTTILE
+{
+    BYTE *pBuffer;
+    HOTTILE_STATE state;
+    DWORD clearData[4];     // May need to change based on pfnClearTile implementation. Reorder for alignment?
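+                            // Color clears read four floats from clearData, the depth
+                            // clear reads clearData[0] as a float, and the stencil clear
+                            // truncates clearData[0] to a uint8_t (see the
+                            // Clear*HotTile functions in threads.cpp).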
+ uint32_t numSamples; + uint32_t renderTargetArrayIndex; // current render target array index loaded +}; + +union HotTileSet +{ + struct + { + HOTTILE Color[SWR_NUM_RENDERTARGETS]; + HOTTILE Depth; + HOTTILE Stencil; + }; + HOTTILE Attachment[SWR_NUM_ATTACHMENTS]; +}; + +class HotTileMgr +{ +public: + HotTileMgr() + { + memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); + + // cache hottile size + for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) + { + mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; + } + mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + } + + ~HotTileMgr() + { + for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x) + { + for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y) + { + for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) + { + if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) + { + _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); + mHotTiles[x][y].Attachment[a].pBuffer = NULL; + } + } + } + } + } + + HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0) + { + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + assert(x < KNOB_NUM_HOT_TILES_X); + assert(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + } + else + { + return NULL; + } + } + else + { + // free the old tile and create a new one with enough space to hold all samples + if (numSamples > hotTile.numSamples) + { + // tile should be either uninitialized or resolved if we're deleting and switching to a + // new sample count + assert((hotTile.state == HOTTILE_INVALID) || + (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); + _aligned_free(hotTile.pBuffer); + + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + } + + // if requested render target array index isn't currently loaded, need to store out the current hottile + // and load the requested array slice + if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) + { + SWR_FORMAT format; + switch (attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + if (hotTile.state == HOTTILE_DIRTY) + { + pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, 
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + } + + pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + hotTile.state = HOTTILE_DIRTY; + } + } + return &tile.Attachment[attachment]; + } + + HotTileSet &GetHotTile(uint32_t macroID) + { + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + assert(x < KNOB_NUM_HOT_TILES_X); + assert(y < KNOB_NUM_HOT_TILES_Y); + + return mHotTiles[x][y]; + } + +private: + HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; +}; + diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp new file mode 100644 index 00000000000..f36452f2cec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp @@ -0,0 +1,148 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file utils.cpp +* +* @brief Utilities used by SWR core. +* +******************************************************************************/ +#if defined(_WIN32) + +#include<Windows.h> +#include <Gdiplus.h> +#include <Gdiplusheaders.h> +#include <cstdint> + +using namespace Gdiplus; + +int GetEncoderClsid(const WCHAR* format, CLSID* pClsid) +{ + uint32_t num = 0; // number of image encoders + uint32_t size = 0; // size of the image encoder array in bytes + + ImageCodecInfo* pImageCodecInfo = nullptr; + + GetImageEncodersSize(&num, &size); + if(size == 0) + return -1; // Failure + + pImageCodecInfo = (ImageCodecInfo*)(malloc(size)); + if(pImageCodecInfo == nullptr) + return -1; // Failure + + GetImageEncoders(num, size, pImageCodecInfo); + + for(uint32_t j = 0; j < num; ++j) + { + if( wcscmp(pImageCodecInfo[j].MimeType, format) == 0 ) + { + *pClsid = pImageCodecInfo[j].Clsid; + free(pImageCodecInfo); + return j; // Success + } + } + + free(pImageCodecInfo); + return -1; // Failure +} + +void SaveImageToPNGFile( + const WCHAR *pFilename, + void *pBuffer, + uint32_t width, + uint32_t height) +{ + // dump pixels to a png + // Initialize GDI+. 
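+    // GDI+ requires GdiplusStartup before any Bitmap is created; the matching
+    // GdiplusShutdown at the end of this function releases it.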
+ GdiplusStartupInput gdiplusStartupInput; + ULONG_PTR gdiplusToken; + GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); + + Bitmap *bitmap = new Bitmap(width, height); + BYTE *pBytes = (BYTE*)pBuffer; + static const uint32_t bytesPerPixel = 4; + for (uint32_t y = 0; y < height; ++y) + for (uint32_t x = 0; x < width; ++x) + { + uint32_t pixel = *(uint32_t*)pBytes; + if (pixel == 0xcdcdcdcd) + { + pixel = 0xFFFF00FF; + } + else if (pixel == 0xdddddddd) + { + pixel = 0x80FF0000; + } + else + { + pixel |= 0xFF000000; + } + Color color(pixel); + bitmap->SetPixel(x, y, color); + pBytes += bytesPerPixel; + } + + // Save image. + CLSID pngClsid; + GetEncoderClsid(L"image/png", &pngClsid); + bitmap->Save(pFilename, &pngClsid, nullptr); + + delete bitmap; + + GdiplusShutdown(gdiplusToken); +} + +void OpenBitmapFromFile( + const WCHAR *pFilename, + void **pBuffer, + uint32_t *width, + uint32_t *height) +{ + GdiplusStartupInput gdiplusStartupInput; + ULONG_PTR gdiplusToken; + GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); + + Bitmap *bitmap = new Bitmap(pFilename); + + *width = bitmap->GetWidth(); + *height = bitmap->GetHeight(); + *pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA| + + // The folder 'stb_image' contains a PNG open/close module which + // is far less painful than this is, yo. + Gdiplus::Color clr; + for (uint32_t y = 0, idx = 0; y < *height; ++y) + { + for (uint32_t x = 0; x < *width; ++x, idx += 4) + { + bitmap->GetPixel(x, *height - y - 1, &clr); + ((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue(); + ((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen(); + ((BYTE*)*pBuffer)[idx + 2] = clr.GetRed(); + ((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha(); + } + } + + delete bitmap; + bitmap = 0; +} +#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h new file mode 100644 index 00000000000..b9dc48c4fd7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -0,0 +1,831 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file utils.h +* +* @brief Utilities used by SWR core. 
+* +******************************************************************************/ +#pragma once + +#include <string.h> +#include "common/os.h" +#include "common/simdintrin.h" +#include "common/swr_assert.h" + +#if defined(_WIN32) +void SaveImageToPNGFile( + const WCHAR *pFilename, + void *pBuffer, + uint32_t width, + uint32_t height); + +void OpenBitmapFromFile( + const WCHAR *pFilename, + void **pBuffer, + uint32_t *width, + uint32_t *height); +#endif + +/// @todo assume linux is always 64 bit +#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) +#define _MM_INSERT_EPI64 _mm_insert_epi64 +#define _MM_EXTRACT_EPI64 _mm_extract_epi64 +#else +INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) +{ + OSALIGNLINE(uint32_t) elems[4]; + _mm_store_si128((__m128i*)elems, a); + if (ndx == 0) + { + uint64_t foo = elems[0]; + foo |= (uint64_t)elems[1] << 32; + return foo; + } + else + { + uint64_t foo = elems[2]; + foo |= (uint64_t)elems[3] << 32; + return foo; + } +} + +INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) +{ + OSALIGNLINE(int64_t) elems[2]; + _mm_store_si128((__m128i*)elems, a); + if (ndx == 0) + { + elems[0] = b; + } + else + { + elems[1] = b; + } + __m128i out; + out = _mm_load_si128((const __m128i*)elems); + return out; +} +#endif + +OSALIGNLINE(struct) BBOX +{ + int top, bottom, left, right; + + BBOX() {} + BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} + + bool operator==(const BBOX& rhs) + { + return (this->top == rhs.top && + this->bottom == rhs.bottom && + this->left == rhs.left && + this->right == rhs.right); + } + + bool operator!=(const BBOX& rhs) + { + return !(*this == rhs); + } +}; + +struct simdBBox +{ + simdscalari top, bottom, left, right; +}; + +INLINE +void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) +{ + __m128i row0i = _mm_castps_si128(row0); + __m128i row1i = _mm_castps_si128(row1); + __m128i row2i = _mm_castps_si128(row2); + __m128i row3i = _mm_castps_si128(row3); + + __m128i vTemp = row2i; + row2i = _mm_unpacklo_epi32(row2i, row3i); + vTemp = _mm_unpackhi_epi32(vTemp, row3i); + + row3i = row0i; + row0i = _mm_unpacklo_epi32(row0i, row1i); + row3i = _mm_unpackhi_epi32(row3i, row1i); + + row1i = row0i; + row0i = _mm_unpacklo_epi64(row0i, row2i); + row1i = _mm_unpackhi_epi64(row1i, row2i); + + row2i = row3i; + row2i = _mm_unpacklo_epi64(row2i, vTemp); + row3i = _mm_unpackhi_epi64(row3i, vTemp); + + row0 = _mm_castsi128_ps(row0i); + row1 = _mm_castsi128_ps(row1i); + row2 = _mm_castsi128_ps(row2i); + row3 = _mm_castsi128_ps(row3i); +} + +INLINE +void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) +{ + __m128i vTemp = row2; + row2 = _mm_unpacklo_epi32(row2, row3); + vTemp = _mm_unpackhi_epi32(vTemp, row3); + + row3 = row0; + row0 = _mm_unpacklo_epi32(row0, row1); + row3 = _mm_unpackhi_epi32(row3, row1); + + row1 = row0; + row0 = _mm_unpacklo_epi64(row0, row2); + row1 = _mm_unpackhi_epi64(row1, row2); + + row2 = row3; + row2 = _mm_unpacklo_epi64(row2, vTemp); + row3 = _mm_unpackhi_epi64(row3, vTemp); +} + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +#if defined(__GNUC__) && (GCC_VERSION < 40900) +#define _mm_undefined_ps _mm_setzero_ps +#define _mm_undefined_si128 _mm_setzero_si128 +#if KNOB_SIMD_WIDTH == 8 +#define _mm256_undefined_ps _mm256_setzero_ps +#endif +#endif + +#if KNOB_SIMD_WIDTH == 8 +INLINE +void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) +{ + __m256 
r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                           //x0z0x1z1 x4z4x5z5
+    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());   //y0w0y1w1 y4w4y5w5
+    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);               //x0y0z0w0 x4y4z4w4
+    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);               //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                          //x2z2x3z3 x6z6x7z7
+    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());          //y2w2y3w3 y6w6y7w7
+    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);               //x2y2z2w2 x6y6z6w6
+    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);               //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
+    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
+    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
+    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+
+    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+}
+
+INLINE
+void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
+{
+    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                   //x0z0x1z1 x4z4x5z5
+    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                   //y0w0y1w1 y4w4y5w5
+    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);               //x0y0z0w0 x4y4z4w4
+    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);               //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                          //x2z2x3z3 x6z6x7z7
+    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3);                          //y2w2y3w3 y6w6y7w7
+    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);               //x2y2z2w2 x6y6z6w6
+    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);               //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
+    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
+    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
+    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+
+    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+}
+
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
+{
+    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
+    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
+    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
+    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
+    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
+    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
+    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
+    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
+    __m256 __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3,2,3,2));
+    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
+    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
+    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
+}
+
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
+{
+    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
+        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// TransposeSingleComponent
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t bpp>
+struct TransposeSingleComponent
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Pass-thru for single component.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH == KNOB_ARCH_AVX
+        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
+        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
+        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
+        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
+        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
+        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
+        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
+        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
+        _mm_store_si128((__m128i*)pDst, c0123lo);
+        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+        simdscalari dst01 = _mm256_shuffle_epi8(src,
+            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
+        dst23 = _mm256_shuffle_epi8(dst23,
+            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+        simdscalari dst = _mm256_or_si256(dst01, dst23);
+        _simd_store_si((simdscalari*)pDst, dst);
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
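+    /// (Left deleted below, so any attempted use fails at compile time;
+    /// presumably no current format path needs a packed 24-bit transpose.)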
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose8_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose8_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 8_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + simdscalari src = _simd_load_si((const simdscalari*)pSrc); + +#if KNOB_SIMD_WIDTH == 8 + __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg + __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg + rg = _mm_unpacklo_epi8(rg, g); + _mm_store_si128((__m128i*)pDst, rg); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); + + __m128 vDst[8]; + vTranspose4x8(vDst, src0, src1, src2, src3); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst+4, vDst[1]); + _mm_store_ps((float*)pDst+8, vDst[2]); + _mm_store_ps((float*)pDst+12, vDst[3]); + _mm_store_ps((float*)pDst+16, vDst[4]); + _mm_store_ps((float*)pDst+20, vDst[5]); + _mm_store_ps((float*)pDst+24, vDst[6]); + _mm_store_ps((float*)pDst+28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 
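+    /// (Three 8-wide float channels in; vTranspose3x8 pads the missing w
+    /// lane with undefined data, so each output pixel is still written as a
+    /// full 16-byte vec4.)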
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + + __m128 vDst[8]; + vTranspose3x8(vDst, src0, src1, src2); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst + 4, vDst[1]); + _mm_store_ps((float*)pDst + 8, vDst[2]); + _mm_store_ps((float*)pDst + 12, vDst[3]); + _mm_store_ps((float*)pDst + 16, vDst[4]); + _mm_store_ps((float*)pDst + 20, vDst[5]); + _mm_store_ps((float*)pDst + 24, vDst[6]); + _mm_store_ps((float*)pDst + 28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + const float* pfSrc = (const float*)pSrc; + __m128 src_r0 = _mm_load_ps(pfSrc + 0); + __m128 src_r1 = _mm_load_ps(pfSrc + 4); + __m128 src_g0 = _mm_load_ps(pfSrc + 8); + __m128 src_g1 = _mm_load_ps(pfSrc + 12); + + __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); + __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); + __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); + __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); + + float* pfDst = (float*)pDst; + _mm_store_ps(pfDst + 0, dst0); + _mm_store_ps(pfDst + 4, dst1); + _mm_store_ps(pfDst + 8, dst2); + _mm_store_ps(pfDst + 12, dst3); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 
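+    /// (Four 8-wide 16-bit channels are interleaved with epi16/epi32 unpacks
+    /// into 8 RGBA16 pixels, 64 bytes total.)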
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm256_extractf128_si256(src_ba, 0); + __m128i src_a = _mm256_extractf128_si256(src_ba, 1); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); + __m128i src_a = _mm_undefined_si128(); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + simdscalar src = _simd_load_ps((const float*)pSrc); + +#if KNOB_SIMD_WIDTH == 8 + __m128 comp0 = _mm256_castps256_ps128(src); + __m128 comp1 = _mm256_extractf128_ps(src, 1); + + __m128i comp0i = _mm_castps_si128(comp0); + __m128i comp1i = _mm_castps_si128(comp1); + + __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); + __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); + + _mm_store_si128((__m128i*)pDst, resLo); + _mm_store_si128((__m128i*)pDst + 1, resHi); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose24_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose24_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 24_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_8_24 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_8_24 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + + + +////////////////////////////////////////////////////////////////////////// +/// Transpose4_4_4_4 +////////////////////////////////////////////////////////////////////////// +struct Transpose4_4_4_4 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_6_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_6_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose9_9_9_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose9_9_9_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_5_5_1 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_5_5_1 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose10_10_10_2 +////////////////////////////////////////////////////////////////////////// +struct Transpose10_10_10_2 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose11_11_10 +////////////////////////////////////////////////////////////////////////// +struct Transpose11_11_10 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +// helper function to unroll loops +template<int Begin, int End, int Step = 1> +struct UnrollerL { + template<typename Lambda> + INLINE static void step(Lambda& func) { + func(Begin); + UnrollerL<Begin + Step, End, Step>::step(func); + } +}; + +template<int End, int Step> +struct UnrollerL<End, End, Step> { + template<typename Lambda> + static void step(Lambda& func) { + } +}; + +// general CRC compute +INLINE +uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) +{ +#if defined(_WIN64) || defined(__x86_64__) + uint32_t sizeInQwords = size / sizeof(uint64_t); + uint32_t sizeRemainderBytes = size % sizeof(uint64_t); + uint64_t* pDataWords = (uint64_t*)pData; + for (uint32_t i = 0; i < sizeInQwords; ++i) + { + crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); + } +#else + uint32_t sizeInDwords = size / sizeof(uint32_t); + uint32_t sizeRemainderBytes = size % sizeof(uint32_t); + uint32_t* pDataWords = (uint32_t*)pData; + for (uint32_t i = 0; i < sizeInDwords; ++i) + { + crc = _mm_crc32_u32(crc, *pDataWords++); + } +#endif + + BYTE* pRemainderBytes = (BYTE*)pDataWords; + for (uint32_t i = 0; i < sizeRemainderBytes; ++i) + { + crc = _mm_crc32_u8(crc, *pRemainderBytes++); + } + + return crc; +} + +////////////////////////////////////////////////////////////////////////// +/// Add byte offset to any-type pointer +////////////////////////////////////////////////////////////////////////// +template <typename T> +INLINE +static T* PtrAdd(T* p, intptr_t offset) +{ + intptr_t intp = reinterpret_cast<intptr_t>(p); + return reinterpret_cast<T*>(intp + offset); +} + +////////////////////////////////////////////////////////////////////////// +/// Is a power-of-2? 
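Before the power-of-2 helper below, a quick aside on the UnrollerL template just defined: its recursion is easiest to see expanded at a call site. The following is a hedged, hypothetical usage sketch (the array and lambda are invented for illustration); UnrollerL<0, 4>::step(func) expands into func(0); func(1); func(2); func(3), each call receiving a compile-time-constant index the optimizer can fold:

    // Hypothetical usage sketch for UnrollerL (not part of the change itself)
    float accum[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    auto func = [&](int i) { accum[i] *= 2.0f; };  // 'i' is a compile-time constant per call
    UnrollerL<0, 4>::step(func);                   // func(0); func(1); func(2); func(3);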
+////////////////////////////////////////////////////////////////////////// +template <typename T> +INLINE +static bool IsPow2(T value) +{ + return value == (value & (0 - value)); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignDownPow2(T1 value, T2 alignment) +{ + SWR_ASSERT(IsPow2(alignment)); + return value & ~T1(alignment - 1); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignUpPow2(T1 value, T2 alignment) +{ + return AlignDownPow2(value + T1(alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up ptr to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignUpPow2(T1* value, T2 alignment) +{ + return reinterpret_cast<T1*>( + AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignDown(T1 value, T2 alignment) +{ + if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } + return value - T1(value % alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignDown(T1* value, T2 alignment) +{ + return (T1*)AlignDown(uintptr_t(value), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignUp(T1 value, T2 alignment) +{ + return AlignDown(value + T1(alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignUp(T1* value, T2 alignment) +{ + return AlignDown(PtrAdd(value, alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Helper structure used to access an array of elements that don't +/// correspond to a typical word size. 
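As a hedged aside before the bit-array helper: a few worked values make the alignment routines above concrete. These assertions are illustrative only and assume the templates above plus <cassert> are in scope:

    assert(AlignDownPow2(13u, 8u) == 8u);  // 13 & ~7
    assert(AlignUpPow2(13u, 8u) == 16u);   // (13 + 7) & ~7
    assert(AlignUp(10u, 6u) == 12u);       // non-power-of-2 path: 15 - (15 % 6)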
+//////////////////////////////////////////////////////////////////////////
+template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
+class BitsArray
+{
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
+    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
+    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
+
+    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
+        "Element size must be an integral fraction of pointer size");
+
+    size_t m_words[NUM_WORDS] = {};
+
+public:
+
+    T operator[] (size_t elementIndex) const
+    {
+        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
+        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
+        return T(word & ELEMENT_MASK);
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
new file mode 100644
index 00000000000..734c89792f0
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -0,0 +1,313 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file JitManager.cpp
+*
+* @brief Implementation of the Jit Manager.
+*
+* Notes:
+*
+******************************************************************************/
+#if defined(_WIN32)
+#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
+#endif
+
+#include "jit_api.h"
+#include "JitManager.h"
+#include "fetch_jit.h"
+
+#if defined(_WIN32)
+#include "llvm/ADT/Triple.h"
+#endif
+#include "llvm/IR/Function.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/IRReader/IRReader.h"
+
+#include "core/state.h"
+#include "common/containers.hpp"
+
+#include "state_llvm.h"
+
+#include <sstream>
+#if defined(_WIN32)
+#include <psapi.h>
+#include <cstring>
+
+#define INTEL_OUTPUT_DIR "c:\\Intel"
+#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR"
+#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
+#endif
+
+using namespace llvm;
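Stepping back briefly to the BitsArray template that closed the previous file: a hedged sketch of how such a packed array is read. The parameters here are invented for illustration; the class only exposes read access, and a default-constructed instance is zeroed:

    BitsArray<uint32_t, 4, 16> table;  // sixteen 4-bit elements; one 64-bit word on x86-64
    uint32_t element = table[5];       // extracts bits 20..23 of word 0; 0 here since m_words is zeroed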
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for JitManager.
+/// @param simdWidth - SIMD width to be used in generated program.
+JitManager::JitManager(uint32_t simdWidth, const char *arch)
+    : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
+{
+    InitializeNativeTarget();
+    InitializeNativeTargetAsmPrinter();
+    InitializeNativeTargetDisassembler();
+
+    TargetOptions tOpts;
+    tOpts.AllowFPOpFusion = FPOpFusion::Fast;
+    tOpts.NoInfsFPMath = false;
+    tOpts.NoNaNsFPMath = false;
+    tOpts.UnsafeFPMath = true;
+#if defined(_DEBUG)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 7
+    tOpts.NoFramePointerElim = true;
+#endif
+#endif
+
+    //tOpts.PrintMachineCode = true;
+
+    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+    fnName << mJitNumber++;
+    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
+    mpCurrentModule = newModule.get();
+
+    auto &&EB = EngineBuilder(std::move(newModule));
+    EB.setTargetOptions(tOpts);
+    EB.setOptLevel(CodeGenOpt::Aggressive);
+
+    StringRef hostCPUName;
+
+    // force JIT to use the same CPU arch as the rest of swr
+    if(mArch.AVX512F())
+    {
+        assert(0 && "Implement AVX512 jitter");
+        hostCPUName = sys::getHostCPUName();
+        if (mVWidth == 0)
+        {
+            mVWidth = 16;
+        }
+    }
+    else if(mArch.AVX2())
+    {
+        hostCPUName = StringRef("core-avx2");
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else if(mArch.AVX())
+    {
+        if (mArch.F16C())
+        {
+            hostCPUName = StringRef("core-avx-i");
+        }
+        else
+        {
+            hostCPUName = StringRef("corei7-avx");
+        }
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else
+    {
+        hostCPUName = sys::getHostCPUName();
+        if (mVWidth == 0)
+        {
+            mVWidth = 8; // 4?
+        }
+    }
+
+    EB.setMCPU(hostCPUName);
+
+#if defined(_WIN32)
+    // Needed for MCJIT on windows
+    Triple hostTriple(sys::getProcessTriple());
+    hostTriple.setObjectFormat(Triple::ELF);
+    mpCurrentModule->setTargetTriple(hostTriple.getTriple());
+#endif // _WIN32
+
+    mpExec = EB.create();
+
+#if LLVM_USE_INTEL_JITEVENTS
+    JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
+    mpExec->RegisterJITEventListener(vTune);
+#endif
+
+    mFP32Ty = Type::getFloatTy(mContext);   // float type
+    mInt8Ty = Type::getInt8Ty(mContext);
+    mInt32Ty = Type::getInt32Ty(mContext);   // int type
+    mInt64Ty = Type::getInt64Ty(mContext);   // int type
+    mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+    mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+
+    // fetch function signature
+    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
+    std::vector<Type*> fsArgs;
+    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
+    fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
+
+    mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
+
+    mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+    mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+
+    mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false);
+    mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false);
+
+#if defined(_WIN32)
+    // explicitly instantiate used symbols from potentially statically linked libs
+    sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
+    sys::DynamicLibrary::AddSymbol("log2f", &log2f);
+    sys::DynamicLibrary::AddSymbol("sinf", &sinf);
+    sys::DynamicLibrary::AddSymbol("cosf", &cosf);
+    sys::DynamicLibrary::AddSymbol("powf", &powf);
+#endif
+
+#if defined(_WIN32)
+    if
(KNOB_DUMP_SHADER_IR) + { + CreateDirectory(INTEL_OUTPUT_DIR, NULL); + CreateDirectory(SWR_OUTPUT_DIR, NULL); + CreateDirectory(JITTER_OUTPUT_DIR, NULL); + } + + ///@todo Figure out a better solution for this. + // Redirect stdin, stdout, and stderr to attached console. + freopen("CONIN$", "r", stdin); + freopen("CONOUT$", "w", stdout); + freopen("CONOUT$", "w", stderr); +#endif +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Create new LLVM module. +void JitManager::SetupNewModule() +{ + SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!"); + + std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << mJitNumber++; + std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext)); + mpCurrentModule = newModule.get(); +#if defined(_WIN32) + // Needed for MCJIT on windows + Triple hostTriple(sys::getProcessTriple()); + hostTriple.setObjectFormat(Triple::ELF); + newModule->setTargetTriple(hostTriple.getTriple()); +#endif // _WIN32 + + mpExec->addModule(std::move(newModule)); + mIsModuleFinalized = false; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Create new LLVM module from IR. +bool JitManager::SetupModuleFromIR(const uint8_t *pIR) +{ + std::unique_ptr<MemoryBuffer> pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), ""); + + SMDiagnostic Err; + std::unique_ptr<Module> newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext); + + if (newModule == nullptr) + { + SWR_ASSERT(0, "Parse failed! Check Err for details."); + return false; + } + + mpCurrentModule = newModule.get(); +#if defined(_WIN32) + // Needed for MCJIT on windows + Triple hostTriple(sys::getProcessTriple()); + hostTriple.setObjectFormat(Triple::ELF); + newModule->setTargetTriple(hostTriple.getTriple()); +#endif // _WIN32 + + mpExec->addModule(std::move(newModule)); + mIsModuleFinalized = false; + + return true; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Dump function to file. +void JitManager::DumpToFile(Function *f, const char *fileName) +{ + if (KNOB_DUMP_SHADER_IR) + { +#if defined(_WIN32) + DWORD pid = GetCurrentProcessId(); + TCHAR procname[MAX_PATH]; + GetModuleFileName(NULL, procname, MAX_PATH); + const char* pBaseName = strrchr(procname, '\\'); + std::stringstream outDir; + outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; + CreateDirectory(outDir.str().c_str(), NULL); +#endif + + std::error_code EC; + const char *funcName = f->getName().data(); + char fName[256]; +#if defined(_WIN32) + sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName); +#else + sprintf(fName, "%s.%s.ll", funcName, fileName); +#endif + raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); + Module* pModule = f->getParent(); + pModule->print(fd, nullptr); + +#if defined(_WIN32) + sprintf(fName, "%s\\cfg.%s.%s.dot", outDir.str().c_str(), funcName, fileName); +#else + sprintf(fName, "cfg.%s.%s.dot", funcName, fileName); +#endif + fd.flush(); + + raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); + WriteGraph(fd_cfg, (const Function*)f); + + fd_cfg.flush(); + } +} + +extern "C" +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Create JIT context. + /// @param simdWidth - SIMD width to be used in generated program. 
+ HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch) + { + return new JitManager(targetSimdWidth, arch); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Destroy JIT context. + void JITCALL JitDestroyContext(HANDLE hJitContext) + { + delete reinterpret_cast<JitManager*>(hJitContext); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h new file mode 100644 index 00000000000..c974a611224 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -0,0 +1,186 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file JitManager.h +* +* @brief JitManager contains the LLVM data structures used for JIT generation +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "common/os.h" +#include "common/isa.hpp" + +#if defined(_WIN32) +#pragma warning(disable : 4146 4244 4267 4800 4996) +#endif + +// llvm 3.7+ reuses "DEBUG" as an enum value +#pragma push_macro("DEBUG") +#undef DEBUG + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" + +#include "llvm/Config/llvm-config.h" +#ifndef LLVM_VERSION_MAJOR +#include "llvm/Config/config.h" +#endif + +#include "llvm/IR/Verifier.h" +#include "llvm/ExecutionEngine/MCJIT.h" +#include "llvm/Support/FileSystem.h" +#define LLVM_F_NONE sys::fs::F_None + +#include "llvm/Analysis/Passes.h" + +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#include "llvm/PassManager.h" +#else +#include "llvm/IR/LegacyPassManager.h" +using namespace llvm::legacy; +#endif + +#include "llvm/CodeGen/Passes.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/Host.h" + + +#pragma pop_macro("DEBUG") + +using namespace llvm; +////////////////////////////////////////////////////////////////////////// +/// JitInstructionSet +/// @brief Subclass of InstructionSet that allows users to override +/// the reporting of support for certain ISA features. This allows capping +/// the jitted code to a certain feature level, e.g. jit AVX level code on +/// a platform that supports AVX2. +////////////////////////////////////////////////////////////////////////// +class JitInstructionSet : public InstructionSet +{ +public: + JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) + { + std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); + + if(isaRequest == "avx") + { + bForceAVX = true; + bForceAVX2 = false; + bForceAVX512 = false; + } + else if(isaRequest == "avx2") + { + bForceAVX = false; + bForceAVX2 = true; + bForceAVX512 = false; + } + #if 0 + else if(isaRequest == "avx512") + { + bForceAVX = false; + bForceAVX2 = false; + bForceAVX512 = true; + } + #endif + }; + + bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } + bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } + bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } + +private: + bool bForceAVX = false; + bool bForceAVX2 = false; + bool bForceAVX512 = false; + std::string isaRequest; +}; + + + +struct JitLLVMContext : LLVMContext +{ +}; + + +////////////////////////////////////////////////////////////////////////// +/// JitManager +////////////////////////////////////////////////////////////////////////// +struct JitManager +{ + JitManager(uint32_t w, const char *arch); + ~JitManager(){}; + + JitLLVMContext mContext; ///< LLVM compiler + IRBuilder<> mBuilder; ///< LLVM IR Builder + ExecutionEngine* mpExec; + + // Need to be rebuilt after a JIT and before building new IR + Module* mpCurrentModule; + bool mIsModuleFinalized; + uint32_t mJitNumber; + + uint32_t mVWidth; + + // Built in types. 
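One note before the built-in type members: the JitInstructionSet force flags above let a caller cap the generated ISA below what the host actually supports. A hedged sketch:

    JitInstructionSet isa("avx");  // request AVX-only codegen
    bool avx2 = isa.AVX2();        // always false: the "avx" cap masks AVX2 reporting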
+ Type* mInt8Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mFP32Ty; + StructType* mV4FP32Ty; + StructType* mV4Int32Ty; + + // helper scalar function types + FunctionType* mUnaryFPTy; + FunctionType* mBinaryFPTy; + FunctionType* mTrinaryFPTy; + FunctionType* mUnaryIntTy; + FunctionType* mBinaryIntTy; + FunctionType* mTrinaryIntTy; + + Type* mSimtFP32Ty; + Type* mSimtInt32Ty; + + Type* mSimdVectorInt32Ty; + Type* mSimdVectorTy; + + // fetch shader types + FunctionType* mFetchShaderTy; + + JitInstructionSet mArch; + + void SetupNewModule(); + bool SetupModuleFromIR(const uint8_t *pIR); + + static void DumpToFile(Function *f, const char *fileName); +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp new file mode 100644 index 00000000000..954524afd3a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -0,0 +1,772 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file blend_jit.cpp +* +* @brief Implementation of the blend jitter +* +* Notes: +* +******************************************************************************/ +#include "jit_api.h" +#include "blend_jit.h" +#include "builder.h" +#include "state_llvm.h" +#include "common/containers.hpp" +#include "llvm/IR/DataLayout.h" + +#include <sstream> + +// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized +#define QUANTIZE_THRESHOLD 2 + +////////////////////////////////////////////////////////////////////////// +/// Interface to Jitting a blend shader +////////////////////////////////////////////////////////////////////////// +struct BlendJit : public Builder +{ + BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; + + template<bool Color, bool Alpha> + void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) + { + Value* out[4]; + + switch (factor) + { + case BLENDFACTOR_ONE: + out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); + break; + case BLENDFACTOR_SRC_COLOR: + out[0] = src[0]; + out[1] = src[1]; + out[2] = src[2]; + out[3] = src[3]; + break; + case BLENDFACTOR_SRC_ALPHA: + out[0] = out[1] = out[2] = out[3] = src[3]; + break; + case BLENDFACTOR_DST_ALPHA: + out[0] = out[1] = out[2] = out[3] = dst[3]; + break; + case BLENDFACTOR_DST_COLOR: + out[0] = dst[0]; + out[1] = dst[1]; + out[2] = dst[2]; + out[3] = dst[3]; + break; + case BLENDFACTOR_SRC_ALPHA_SATURATE: + out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); + out[3] = VIMMED1(1.0f); + break; + case BLENDFACTOR_CONST_COLOR: + out[0] = constColor[0]; + out[1] = constColor[1]; + out[2] = constColor[2]; + out[3] = constColor[3]; + break; + case BLENDFACTOR_CONST_ALPHA: + out[0] = out[1] = out[2] = out[3] = constColor[3]; + break; + case BLENDFACTOR_SRC1_COLOR: + out[0] = src1[0]; + out[1] = src1[1]; + out[2] = src1[2]; + out[3] = src1[3]; + break; + case BLENDFACTOR_SRC1_ALPHA: + out[0] = out[1] = out[2] = out[3] = src1[3]; + break; + case BLENDFACTOR_ZERO: + out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); + break; + case BLENDFACTOR_INV_SRC_COLOR: + out[0] = FSUB(VIMMED1(1.0f), src[0]); + out[1] = FSUB(VIMMED1(1.0f), src[1]); + out[2] = FSUB(VIMMED1(1.0f), src[2]); + out[3] = FSUB(VIMMED1(1.0f), src[3]); + break; + case BLENDFACTOR_INV_SRC_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); + break; + case BLENDFACTOR_INV_DST_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); + break; + case BLENDFACTOR_INV_DST_COLOR: + out[0] = FSUB(VIMMED1(1.0f), dst[0]); + out[1] = FSUB(VIMMED1(1.0f), dst[1]); + out[2] = FSUB(VIMMED1(1.0f), dst[2]); + out[3] = FSUB(VIMMED1(1.0f), dst[3]); + break; + case BLENDFACTOR_INV_CONST_COLOR: + out[0] = FSUB(VIMMED1(1.0f), constColor[0]); + out[1] = FSUB(VIMMED1(1.0f), constColor[1]); + out[2] = FSUB(VIMMED1(1.0f), constColor[2]); + out[3] = FSUB(VIMMED1(1.0f), constColor[3]); + break; + case BLENDFACTOR_INV_CONST_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); + break; + case BLENDFACTOR_INV_SRC1_COLOR: + out[0] = FSUB(VIMMED1(1.0f), src1[0]); + out[1] = FSUB(VIMMED1(1.0f), src1[1]); + out[2] = FSUB(VIMMED1(1.0f), src1[2]); + out[3] = FSUB(VIMMED1(1.0f), src1[3]); + break; + case BLENDFACTOR_INV_SRC1_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); + break; + default: + SWR_ASSERT(false, "Unsupported blend factor: %d", factor); + out[0] = out[1] = out[2] = out[3] = 
VIMMED1(0.0f);
+            break;
+        }
+
+        if (Color)
+        {
+            result[0] = out[0];
+            result[1] = out[1];
+            result[2] = out[2];
+        }
+
+        if (Alpha)
+        {
+            result[3] = out[3];
+        }
+    }
+
+    void Clamp(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+        SWR_TYPE type = info.type[0];
+
+        switch (type)
+        {
+        case SWR_TYPE_FLOAT:
+            break;
+
+        case SWR_TYPE_UNORM:
+            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
+            break;
+
+        case SWR_TYPE_SNORM:
+            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            break;
+
+        default: SWR_ASSERT(false, "Unsupported format type: %d", type);
+        }
+    }
+
+    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        bool valid[] = { false, false, false, false };
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            valid[info.swizzle[c]] = true;
+        }
+
+        for (uint32_t c = 0; c < 4; ++c)
+        {
+            if (!valid[c])
+            {
+                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
+            }
+        }
+    }
+
+    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            if (info.type[c] == SWR_TYPE_UNUSED)
+            {
+                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
+            }
+        }
+    }
+
+    void Quantize(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            if (info.bpc[c] <= QUANTIZE_THRESHOLD)
+            {
+                uint32_t swizComp = info.swizzle[c];
+                float factor = (float)((1 << info.bpc[c]) - 1);
+                switch (info.type[c])
+                {
+                case SWR_TYPE_UNORM:
+                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
+                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
+                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
+                    break;
+                default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
+                }
+            }
+        }
+    }
+
+    template<bool Color, bool Alpha>
+    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
+    {
+        Value* out[4];
+        Value* srcBlend[4];
+        Value* dstBlend[4];
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            srcBlend[i] = FMUL(src[i], srcFactor[i]);
+            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
+        }
+
+        switch (blendOp)
+        {
+        case BLENDOP_ADD:
+            out[0] = FADD(srcBlend[0], dstBlend[0]);
+            out[1] = FADD(srcBlend[1], dstBlend[1]);
+            out[2] = FADD(srcBlend[2], dstBlend[2]);
+            out[3] = FADD(srcBlend[3], dstBlend[3]);
+            break;
+
+        case BLENDOP_SUBTRACT:
+            out[0] = FSUB(srcBlend[0], dstBlend[0]);
+            out[1] = FSUB(srcBlend[1], dstBlend[1]);
+            out[2] = FSUB(srcBlend[2], dstBlend[2]);
+            out[3] = FSUB(srcBlend[3], dstBlend[3]);
+            break;
+
+        case BLENDOP_REVSUBTRACT:
+            out[0] = FSUB(dstBlend[0], srcBlend[0]);
+            out[1] = FSUB(dstBlend[1], srcBlend[1]);
+            out[2] = FSUB(dstBlend[2], srcBlend[2]);
+            out[3] = FSUB(dstBlend[3], srcBlend[3]);
+            break;
+
+        case BLENDOP_MIN:
+            out[0] = VMINPS(src[0], dst[0]);
+            out[1] = VMINPS(src[1], dst[1]);
+            out[2] = VMINPS(src[2], dst[2]);
+            out[3] = VMINPS(src[3],
dst[3]); + break; + + case BLENDOP_MAX: + out[0] = VMAXPS(src[0], dst[0]); + out[1] = VMAXPS(src[1], dst[1]); + out[2] = VMAXPS(src[2], dst[2]); + out[3] = VMAXPS(src[3], dst[3]); + break; + + default: + SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp); + out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); + break; + } + + if (Color) + { + result[0] = out[0]; + result[1] = out[1]; + result[2] = out[2]; + } + + if (Alpha) + { + result[3] = out[3]; + } + } + + void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4]) + { + // Op: (s == PS output, d = RT contents) + switch(logicOp) + { + case LOGICOP_CLEAR: + result[0] = VIMMED1(0); + result[1] = VIMMED1(0); + result[2] = VIMMED1(0); + result[3] = VIMMED1(0); + break; + + case LOGICOP_NOR: + // ~(s | d) + result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND_INVERTED: + // ~s & d + // todo: use avx andnot instr when I can find the intrinsic to call + result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); + result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); + result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); + result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); + break; + + case LOGICOP_COPY_INVERTED: + // ~s + result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF)); + result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF)); + result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF)); + result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND_REVERSE: + // s & ~d + // todo: use avx andnot instr when I can find the intrinsic to call + result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); + result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); + result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); + result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); + break; + + case LOGICOP_INVERT: + // ~d + result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF)); + result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF)); + result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF)); + result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_XOR: + // s ^ d + result[0] = XOR(src[0], dst[0]); + result[1] = XOR(src[1], dst[1]); + result[2] = XOR(src[2], dst[2]); + result[3] = XOR(src[3], dst[3]); + break; + + case LOGICOP_NAND: + // ~(s & d) + result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND: + // s & d + result[0] = AND(src[0], dst[0]); + result[1] = AND(src[1], dst[1]); + result[2] = AND(src[2], dst[2]); + result[3] = AND(src[3], dst[3]); + break; + + case LOGICOP_EQUIV: + // ~(s ^ d) + result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_NOOP: + result[0] = dst[0]; + result[1] = dst[1]; + result[2] = dst[2]; + result[3] = dst[3]; + break; + + case LOGICOP_OR_INVERTED: + // ~s | d + result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); + result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); + result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); + 
result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); + break; + + case LOGICOP_COPY: + result[0] = src[0]; + result[1] = src[1]; + result[2] = src[2]; + result[3] = src[3]; + break; + + case LOGICOP_OR_REVERSE: + // s | ~d + result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); + result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); + result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); + result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); + break; + + case LOGICOP_OR: + // s | d + result[0] = OR(src[0], dst[0]); + result[1] = OR(src[1], dst[1]); + result[2] = OR(src[2], dst[2]); + result[3] = OR(src[3], dst[3]); + break; + + case LOGICOP_SET: + result[0] = VIMMED1(0xFFFFFFFF); + result[1] = VIMMED1(0xFFFFFFFF); + result[2] = VIMMED1(0xFFFFFFFF); + result[3] = VIMMED1(0xFFFFFFFF); + break; + + default: + SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp); + result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f); + break; + } + } + + void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* pAlpha, Value* ppMask) + { + // load uint32_t reference + Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference })); + + Value* pTest = nullptr; + if (state.alphaTestFormat == ALPHA_TEST_UNORM8) + { + // convert float alpha to unorm8 + Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); + pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); + + // compare + switch (state.alphaTestFunction) + { + case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; + case ZFUNC_NEVER: pTest = VIMMED1(false); break; + case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break; + case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break; + case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break; + case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break; + case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break; + case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break; + default: + SWR_ASSERT(false, "Invalid alpha test function"); + break; + } + } + else + { + // cast ref to float + pRef = BITCAST(pRef, mSimdFP32Ty); + + // compare + switch (state.alphaTestFunction) + { + case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; + case ZFUNC_NEVER: pTest = VIMMED1(false); break; + case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break; + case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break; + case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break; + case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break; + case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break; + case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break; + default: + SWR_ASSERT(false, "Invalid alpha test function"); + break; + } + } + + // load current mask + Value* pMask = LOAD(ppMask); + + // convert to int1 mask + pMask = MASK(pMask); + + // and with alpha test result + pMask = AND(pMask, pTest); + + // convert back to vector mask + pMask = VMASK(pMask); + + // store new mask + STORE(pMask, ppMask); + } + + Function* Create(const BLEND_COMPILE_STATE& state) + { + static std::size_t jitNum = 0; + + std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << jitNum++; + + // blend function signature + //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); + + std::vector<Type*> args{ + PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE* + PointerType::get(mSimdFP32Ty, 0), // simdvector& src + PointerType::get(mSimdFP32Ty, 0), // simdvector& src1 + 
Type::getInt32Ty(JM()->mContext), // sampleNum + PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst + PointerType::get(mSimdFP32Ty, 0), // simdvector& result + PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask + PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask + }; + + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); + + IRB()->SetInsertPoint(entry); + + // arguments + auto argitr = blendFunc->getArgumentList().begin(); + Value* pBlendState = &*argitr++; + pBlendState->setName("pBlendState"); + Value* pSrc = &*argitr++; + pSrc->setName("src"); + Value* pSrc1 = &*argitr++; + pSrc1->setName("src1"); + Value* sampleNum = &*argitr++; + sampleNum->setName("sampleNum"); + Value* pDst = &*argitr++; + pDst->setName("pDst"); + Value* pResult = &*argitr++; + pResult->setName("result"); + Value* ppoMask = &*argitr++; + ppoMask->setName("ppoMask"); + Value* ppMask = &*argitr++; + ppMask->setName("pMask"); + + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + Value* dst[4]; + Value* constantColor[4]; + Value* src[4]; + Value* src1[4]; + Value* result[4]; + for (uint32_t i = 0; i < 4; ++i) + { + // load hot tile + dst[i] = LOAD(pDst, { i }); + + // load constant color + constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); + + // load src + src[i] = LOAD(pSrc, { i }); + + // load src1 + src1[i] = LOAD(pSrc1, { i }); + } + Value* currentMask = VIMMED1(-1); + if(state.desc.alphaToCoverageEnable) + { + currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + } + + // alpha test + if (state.desc.alphaTestEnable) + { + AlphaTest(state, pBlendState, src[3], ppMask); + } + + // color blend + if (state.blendState.blendEnable) + { + // clamp sources + Clamp(state.format, src); + Clamp(state.format, src1); + Clamp(state.format, dst); + Clamp(state.format, constantColor); + + // apply defaults to hottile contents to take into account missing components + ApplyDefaults(state.format, dst); + + // Force defaults for unused 'X' components + ApplyUnusedDefaults(state.format, dst); + + // Quantize low precision components + Quantize(state.format, dst); + + // special case clamping for R11G11B10_float which has no sign bit + if (state.format == R11G11B10_FLOAT) + { + dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); + dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); + dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); + dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); + } + + Value* srcFactor[4]; + Value* dstFactor[4]; + if (state.desc.independentAlphaBlendEnable) + { + GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); + + GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); + + BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + } + else + { + GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, 
constantColor, src, src1, dst, srcFactor);
+                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+
+                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+            }
+
+            // store results out
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                STORE(result[i], pResult, { i });
+            }
+        }
+
+        if(state.blendState.logicOpEnable)
+        {
+            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
+            SWR_ASSERT(info.type[0] == SWR_TYPE_UINT);
+            Value* vMask[4];
+            for(uint32_t i = 0; i < 4; i++)
+            {
+                switch(info.bpc[i])
+                {
+                case 0: vMask[i] = VIMMED1(0x00000000); break;
+                case 2: vMask[i] = VIMMED1(0x00000003); break;
+                case 5: vMask[i] = VIMMED1(0x0000001F); break;
+                case 6: vMask[i] = VIMMED1(0x0000003F); break;
+                case 8: vMask[i] = VIMMED1(0x000000FF); break;
+                case 10: vMask[i] = VIMMED1(0x000003FF); break;
+                case 11: vMask[i] = VIMMED1(0x000007FF); break;
+                case 16: vMask[i] = VIMMED1(0x0000FFFF); break;
+                case 24: vMask[i] = VIMMED1(0x00FFFFFF); break;
+                case 32: vMask[i] = VIMMED1(0xFFFFFFFF); break;
+                default:
+                    vMask[i] = VIMMED1(0x0);
+                    SWR_ASSERT(0, "Unsupported bpc for logic op\n");
+                    break;
+                }
+                src[i] = BITCAST(src[i], mSimdInt32Ty);//, vMask[i]);
+                dst[i] = BITCAST(dst[i], mSimdInt32Ty);
+            }
+
+            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
+
+            // store results out
+            for(uint32_t i = 0; i < 4; ++i)
+            {
+                // clear upper bits from PS output not in RT format after doing logic op
+                result[i] = AND(result[i], vMask[i]);
+
+                STORE(BITCAST(result[i], mSimdFP32Ty), pResult, {i});
+            }
+        }
+
+        if(state.desc.oMaskEnable)
+        {
+            assert(!(state.desc.alphaToCoverageEnable));
+            // load current mask
+            Value* oMask = LOAD(ppoMask);
+            Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
+            oMask = AND(oMask, sampleMasked);
+            currentMask = AND(oMask, currentMask);
+        }
+
+        if(state.desc.sampleMaskEnable)
+        {
+            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
+            Value* sampleMasked = SHL(C(1), sampleNum);
+            sampleMask = AND(sampleMask, sampleMasked);
+            sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
+            sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
+            currentMask = AND(sampleMask, currentMask);
+        }
+
+        if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
+           state.desc.oMaskEnable)
+        {
+            // load current mask
+            Value* pMask = LOAD(ppMask);
+            currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
+            Value* outputMask = AND(pMask, currentMask);
+            // store new mask
+            STORE(outputMask, GEP(ppMask, C(0)));
+        }
+
+        RET_VOID();
+
+        JitManager::DumpToFile(blendFunc, "");
+
+        FunctionPassManager passes(JM()->mpCurrentModule);
+        passes.add(createBreakCriticalEdgesPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createPromoteMemoryToRegisterPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createInstructionCombiningPass());
+        passes.add(createInstructionSimplifierPass());
+        passes.add(createConstantPropagationPass());
+        passes.add(createSCCPPass());
+        passes.add(createAggressiveDCEPass());
+
+        passes.run(*blendFunc);
+
+        JitManager::DumpToFile(blendFunc, "optimized");
+
+        return blendFunc;
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JITs from blend shader IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - LLVM function handle
+/// @return PFN_BLEND_JIT_FUNC - pointer to blend code
+PFN_BLEND_JIT_FUNC
JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function *func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_BLEND_JIT_FUNC pfnBlend; + pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + + return pfnBlend; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles blend shader +/// @param hJitMgr - JitManager handle +/// @param state - blend state to build function from +extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + pJitMgr->SetupNewModule(); + + BlendJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(state); + + return JitBlendFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h new file mode 100644 index 00000000000..057eb92b67e --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h @@ -0,0 +1,93 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file blend_jit.h +* +* @brief Definition of the blend jitter +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "common/formats.h" +#include "core/context.h" +#include "core/state.h" + +struct RENDER_TARGET_BLEND_COMPILE_STATE +{ + bool blendEnable; + bool logicOpEnable; + SWR_BLEND_FACTOR sourceAlphaBlendFactor; + SWR_BLEND_FACTOR destAlphaBlendFactor; + SWR_BLEND_FACTOR sourceBlendFactor; + SWR_BLEND_FACTOR destBlendFactor; + SWR_BLEND_OP colorBlendFunc; + SWR_BLEND_OP alphaBlendFunc; + SWR_LOGIC_OP logicOpFunc; +}; + +enum ALPHA_TEST_FORMAT +{ + ALPHA_TEST_UNORM8, + ALPHA_TEST_FLOAT32 +}; + +////////////////////////////////////////////////////////////////////////// +/// BLEND_DESC +////////////////////////////////////////////////////////////////////////// +struct BLEND_DESC +{ + union + { + struct + { + uint32_t alphaTestEnable: 1; + uint32_t independentAlphaBlendEnable: 1; + uint32_t alphaToCoverageEnable: 1; + uint32_t oMaskEnable:1; + uint32_t inputCoverageEnable:1; + uint32_t sampleMaskEnable:1; + uint32_t numSamples:5; + uint32_t _reserved : 21; + }; + uint32_t bits; + }; +}; +#define BLEND_ENABLE_MASK 0x3D // a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable +////////////////////////////////////////////////////////////////////////// +/// State required for blend jit +////////////////////////////////////////////////////////////////////////// +struct BLEND_COMPILE_STATE +{ + SWR_FORMAT format; // format of render target being blended + RENDER_TARGET_BLEND_COMPILE_STATE blendState; + BLEND_DESC desc; + + SWR_ZFUNCTION alphaTestFunction; + ALPHA_TEST_FORMAT alphaTestFormat; + + bool operator==(const BLEND_COMPILE_STATE& other) const + { + return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; + } +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp new file mode 100644 index 00000000000..c15bdf1e756 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -0,0 +1,71 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
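Returning briefly to the BLEND_DESC bit-field in blend_jit.h above: because the flags alias a single uint32_t, a filled-in descriptor doubles as a compact key, which is presumably how BLEND_ENABLE_MASK is meant to be applied. A hedged sketch with illustrative field values:

    BLEND_DESC desc = {};            // zero all flag bits
    desc.alphaTestEnable = 1;
    desc.numSamples = 4;
    uint32_t key = desc.bits;        // packed view of the same 32 bits, e.g. for masking or caching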
+*
+* @file builder.cpp
+*
+* @brief Includes all the builder related functionality
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "builder.h"
+
+using namespace llvm;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for Builder.
+/// @param pJitMgr - JitManager which contains modules, function passes, etc.
+Builder::Builder(JitManager *pJitMgr)
+    : mpJitMgr(pJitMgr)
+{
+    mpIRBuilder = &pJitMgr->mBuilder;
+
+    mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+    mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+    mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+    mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
+    mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+    mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+    mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
+    mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+    mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
+    mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+    mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
+    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
+    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+
+    if (sizeof(uint32_t*) == 4)
+    {
+        mIntPtrTy = mInt32Ty;
+        mSimdIntPtrTy = mSimdInt32Ty;
+    }
+    else
+    {
+        SWR_ASSERT(sizeof(uint32_t*) == 8);
+        mIntPtrTy = mInt64Ty;
+        mSimdIntPtrTy = mSimdInt64Ty;
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
new file mode 100644
index 00000000000..49216612cc9
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -0,0 +1,71 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+* +* @file builder.h +* +* @brief Includes all the builder related functionality +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "JitManager.h" +#include "common/formats.h" + +using namespace llvm; + +struct Builder +{ + Builder(JitManager *pJitMgr); + IRBuilder<>* IRB() { return mpIRBuilder; }; + JitManager* JM() { return mpJitMgr; } + + JitManager* mpJitMgr; + IRBuilder<>* mpIRBuilder; + + // Built in types. + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* mDoubleTy; + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + StructType* mV4FP32Ty; + StructType* mV4Int32Ty; + +#include "builder_gen.h" +#include "builder_x86.h" +#include "builder_misc.h" +#include "builder_math.h" + +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h new file mode 100644 index 00000000000..92867ec9836 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h @@ -0,0 +1,34 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file builder_math.h +* +* @brief math/alu builder functions +* +* Notes: +* +******************************************************************************/ +#pragma once + +Value* VLOG2PS(Value* src); +Value* VPOW24PS(Value* src); +Value* VEXP2PS(Value* src); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp new file mode 100644 index 00000000000..5394fc7bf5a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -0,0 +1,1447 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.cpp
+*
+* @brief Implementation for miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#include "builder.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+void __cdecl CallPrint(const char* fmt, ...);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 32-bit single precision float to a
+/// 16-bit float with 5 exponent bits and a variable
+/// number of mantissa bits.
+/// @param val - 32-bit float
+/// @todo Maybe move this outside of this file into a header?
+static uint16_t Convert32To16Float(float val)
+{
+    uint32_t sign, exp, mant;
+    uint32_t roundBits;
+
+    // Extract the sign, exponent, and mantissa
+    uint32_t uf = *(uint32_t*)&val;
+    sign = (uf & 0x80000000) >> 31;
+    exp = (uf & 0x7F800000) >> 23;
+    mant = uf & 0x007FFFFF;
+
+    // Check for out of range
+    if (std::isnan(val))
+    {
+        exp = 0x1F;
+        mant = 0x200;
+        sign = 1; // set the sign bit for NANs
+    }
+    else if (std::isinf(val))
+    {
+        exp = 0x1f;
+        mant = 0x0;
+    }
+    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
+    {
+        exp = 0x1E;
+        mant = 0x3FF;
+    }
+    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+    {
+        mant |= 0x00800000;
+        for (; exp <= 0x70; mant >>= 1, exp++)
+            ;
+        exp = 0;
+        mant = mant >> 13;
+    }
+    else if (exp < 0x66) // Too small to represent -> Zero
+    {
+        exp = 0;
+        mant = 0;
+    }
+    else
+    {
+        // Saves bits that will be shifted off for rounding
+        roundBits = mant & 0x1FFFu;
+        // convert exponent and mantissa to 16 bit format
+        exp = exp - 0x70;
+        mant = mant >> 13;
+
+        // Essentially RTZ, but round up if off by only 1 lsb
+        if (roundBits == 0x1FFFu)
+        {
+            mant++;
+            // check for overflow
+            if ((mant & 0xC00u) != 0)
+                exp++;
+            // make sure only the needed bits are used
+            mant &= 0x3FF;
+        }
+    }
+
+    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
+    return (uint16_t)tmpVal;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+/// float
+/// @param val - 16-bit float
+/// @todo Maybe move this outside of this file into a header?
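+///
+/// Worked examples, straight from the IEEE 754 binary16 encoding (illustrative):
+///   0x3C00 (sign 0, exp 15, mant 0) -> 1.0f
+///   0xC000 (sign 1, exp 16, mant 0) -> -2.0f
+///   0x7C00 (exp 0x1F, mant 0)       -> +infinity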
+static float ConvertSmallFloatTo32(UINT val) +{ + UINT result; + if ((val & 0x7fff) == 0) + { + result = ((uint32_t)(val & 0x8000)) << 16; + } + else if ((val & 0x7c00) == 0x7c00) + { + result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; + result |= ((uint32_t)val & 0x8000) << 16; + } + else + { + uint32_t sign = (val & 0x8000) << 16; + uint32_t mant = (val & 0x3ff) << 13; + uint32_t exp = (val >> 10) & 0x1f; + if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals + { + mant <<= 1; + while (mant < (0x400 << 13)) + { + exp--; + mant <<= 1; + } + mant &= (0x3ff << 13); + } + exp = ((exp - 15 + 127) & 0xff) << 23; + result = sign | exp | mant; + } + + return *(float*)&result; +} + +Constant *Builder::C(bool i) +{ + return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); +} + +Constant *Builder::C(char i) +{ + return ConstantInt::get(IRB()->getInt8Ty(), i); +} + +Constant *Builder::C(uint8_t i) +{ + return ConstantInt::get(IRB()->getInt8Ty(), i); +} + +Constant *Builder::C(int i) +{ + return ConstantInt::get(IRB()->getInt32Ty(), i); +} + +Constant *Builder::C(int64_t i) +{ + return ConstantInt::get(IRB()->getInt64Ty(), i); +} + +Constant *Builder::C(uint16_t i) +{ + return ConstantInt::get(mInt16Ty,i); +} + +Constant *Builder::C(uint32_t i) +{ + return ConstantInt::get(IRB()->getInt32Ty(), i); +} + +Constant *Builder::C(float i) +{ + return ConstantFP::get(IRB()->getFloatTy(), i); +} + +Constant *Builder::PRED(bool pred) +{ + return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); +} + +Value *Builder::VIMMED1(int i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VIMMED1(uint32_t i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VIMMED1(float i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); +} + +Value *Builder::VIMMED1(bool i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VUNDEF_IPTR() +{ + return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); +} + +Value *Builder::VUNDEF_I() +{ + return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); +} + +Value *Builder::VUNDEF(Type *ty, uint32_t size) +{ + return UndefValue::get(VectorType::get(ty, size)); +} + +Value *Builder::VUNDEF_F() +{ + return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); +} + +Value *Builder::VUNDEF(Type* t) +{ + return UndefValue::get(VectorType::get(t, JM()->mVWidth)); +} + +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) +{ + return VINSERT(vec, val, C((int64_t)index)); +} +#endif + +Value *Builder::VBROADCAST(Value *src) +{ + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(JM()->mVWidth, src); +} + +uint32_t Builder::IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getZExtValue(); +} + +Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) +{ + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); +} + +Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) +{ + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); +} + +LoadInst *Builder::LOAD(Value *basePtr, const 
std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(C(i));
+    return LOAD(GEPA(basePtr, valIndices), name);
+}
+
+LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(i);
+    return LOAD(GEPA(basePtr, valIndices), name);
+}
+
+StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(C(i));
+    return STORE(val, GEPA(basePtr, valIndices));
+}
+
+StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(i);
+    return STORE(val, GEPA(basePtr, valIndices));
+}
+
+CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+{
+    std::vector<Value*> args;
+    for (auto arg : argsList)
+        args.push_back(arg);
+    return CALLA(Callee, args);
+}
+
+Value *Builder::VRCP(Value *va)
+{
+    return FDIV(VIMMED1(1.0f), va);  // 1 / a
+}
+
+Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+{
+    Value* vOut = FMADDPS(vA, vX, vC);
+    vOut = FMADDPS(vB, vY, vOut);
+    return vOut;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate an i32 masked load operation in LLVM IR.  If not
+/// supported on the underlying platform, emulate it with float masked load
+/// @param src - base address pointer for the load
+/// @param vMask - SIMD wide mask that controls whether to access memory or load 0
+Value *Builder::MASKLOADD(Value* src,Value* mask)
+{
+    Value* vResult;
+    // use the avx2 masked load instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
+        vResult = CALL(func,{src,mask});
+    }
+    else
+    {
+        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
+        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
+        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+    }
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief insert a JIT call to CallPrint
+/// - outputs formatted string to both stdout and VS output window
+/// - DEBUG builds only
+/// Usage example:
+///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
+///   where C(lane) creates a constant value to print, and pIndex is the Value*
+///   result from a GEP, printing out the pointer to memory
+/// @param printStr - constant string to print, which includes format specifiers
+/// @param printArgs - initializer list of Value*'s to print to std out
+CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+{
+    // push the arguments to CallPrint into a vector
+    std::vector<Value*> printCallArgs;
+    // save room for the format string.
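we still need to modify it for vectors
+    // e.g. (illustrative) PRINT("simd: %t\n", {vFloats}) rewrites the "%t" into
+    // one "%f" per SIMD lane before the string reaches CallPrint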
+    printCallArgs.resize(1);
+
+    // search through the format string for special processing
+    size_t pos = 0;
+    std::string tempStr(printStr);
+    pos = tempStr.find('%', pos);
+    auto v = printArgs.begin();
+
+    while ((pos != std::string::npos) && (v != printArgs.end()))
+    {
+        Value* pArg = *v;
+        Type* pType = pArg->getType();
+
+        if (tempStr[pos + 1] == 't')
+        {
+            if (pType->isVectorTy())
+            {
+                Type* pContainedType = pType->getContainedType(0);
+
+                std::string vectorFormatStr;
+
+                if (pContainedType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure it's %f
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%f ";
+                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
+                    }
+                }
+                else if (pContainedType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure it's %d
+                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%d ";
+                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    }
+                }
+                else
+                {
+                    SWR_ASSERT(0, "Unsupported type");
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                if (pType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure it's %f
+                    printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
+                }
+                else if (pType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure it's %d
+                    printCallArgs.push_back(pArg);
+                }
+            }
+        }
+        else if (toupper(tempStr[pos + 1]) == 'X')
+        {
+            if (pType->isVectorTy())
+            {
+                tempStr[pos] = '0';
+                tempStr.insert(pos + 1, "x%08");
+
+                printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                std::string vectorFormatStr;
+                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                {
+                    vectorFormatStr += "0x%08X ";
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                tempStr[pos] = '0';
+                tempStr.insert(pos + 1, "x%08");
+                printCallArgs.push_back(pArg);
+                pos += 3;
+            }
+        }
+        // for %f we need to cast float Values to doubles so that they print out correctly
+        else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
+        {
+            printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
+            pos++;
+        }
+        // add special handling for %f and %d format specifiers to make printing llvm vector types easier
+        else if (pType->isVectorTy())
+        {
+            Type* pContainedType = pType->getContainedType(0);
+
+            if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%f "));
+                    pos += 3;
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                }
+                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+            }
+            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%d "));
+                    pos += 3;
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+            }
+            else
+            {
+                /// not a supported vector to print
+                /// @todo pointer types too
+                SWR_ASSERT(0);
+            }
+        }
+        else
+        {
+            printCallArgs.push_back(pArg);
+        }
+
+        // advance to the next argument
+        v++;
+        pos = tempStr.find('%', ++pos);
+    }
+
+    // create global variable constant string
+    Constant
*constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); + GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); + JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); + + // get a pointer to the first character in the constant string array + std::vector<Constant*> geplist{C(0),C(0)}; +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); +#else + Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); +#endif + + // insert the pointer to the format string in the argument vector + printCallArgs[0] = strGEP; + + // get pointer to CallPrint function and insert decl into the module if needed + std::vector<Type*> args; + args.push_back(PointerType::get(mInt8Ty,0)); + FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); + Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); + + // if we haven't yet added the symbol to the symbol table + if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) + { + sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); + } + + // insert a call to CallPrint + return CALLA(callPrintFn,printCallArgs); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Wrapper around PRINT with initializer list. +CallInst* Builder::PRINT(const std::string &printStr) +{ + return PRINT(printStr, {}); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a masked gather operation in LLVM IR. If not +/// supported on the underlying platform, emulate it with loads +/// @param vSrc - SIMD wide value that will be loaded if mask is invalid +/// @param pBase - Int8* base VB address pointer value +/// @param vIndices - SIMD wide value of VB byte offsets +/// @param vMask - SIMD wide mask that controls whether to access memory or the src values +/// @param scale - value to scale indices by +Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +{ + Value* vGather; + + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) + { + // force mask to <N x float>, required by vgather + vMask = BITCAST(vMask, mSimdFP32Ty); + vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_F(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); + Value *vOffsets = MUL(vIndices,vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets,C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase,offset); + loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); + Value *selMask = VEXTRACT(mask,C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress); + vGather = VINSERT(vGather,val,C(i)); + } + STACKRESTORE(pStack); + } + + return vGather; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a masked gather operation in LLVM IR. If not +/// supported on the underlying platform, emulate it with loads +/// @param vSrc - SIMD wide value that will be loaded if mask is invalid +/// @param pBase - Int8* base VB address pointer value +/// @param vIndices - SIMD wide value of VB byte offsets +/// @param vMask - SIMD wide mask that controls whether to access memory or the src values +/// @param scale - value to scale indices by +Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +{ + Value* vGather; + + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) + { + vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_I(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); + Value *vOffsets = MUL(vIndices, vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); + Value *selMask = VEXTRACT(mask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress, C(0)); + vGather = VINSERT(vGather, val, C(i)); + } + + STACKRESTORE(pStack); + } + return vGather; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief convert x86 <N x float> mask to llvm <N x i1> mask +Value* Builder::MASK(Value* vmask) +{ + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask +Value* Builder::VMASK(Value* mask) +{ + return S_EXT(mask, mSimdInt32Ty); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPSHUFB operation in LLVM IR. 
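If not
+/// supported on the underlying platform, emulate it.
+///
+/// Byte-selection example (illustrative), within each 128-bit half:
+///   mask byte b[i] == 3               -> output byte i = a[3]
+///   mask byte b[i] negative (MSB set) -> output byte i = 0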
+/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
+/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
+/// Byte masks in the lower 128-bit lane of b select 8-bit values from the
+/// lower 128 bits of a, and vice versa for the upper lanes. If the mask
+/// value is negative, '0' is inserted.
+Value *Builder::PSHUFB(Value* a, Value* b)
+{
+    Value* res;
+    // use avx2 pshufb instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPSHUFB(a, b);
+    }
+    else
+    {
+        Constant* cB = dyn_cast<Constant>(b);
+        // number of 8 bit elements in b
+        uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
+        // output vector
+        Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+
+        // insert an 8 bit value from the high and low lanes of a per loop iteration
+        numElms /= 2;
+        for(uint32_t i = 0; i < numElms; i++)
+        {
+            ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+            ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
+
+            // extract values from constant mask
+            char valLow128bLane = (char)(cLow128b->getSExtValue());
+            char valHigh128bLane = (char)(cHigh128b->getSExtValue());
+
+            Value* insertValLow128b;
+            Value* insertValHigh128b;
+
+            // if the mask value is negative, insert a '0' in the respective output position
+            // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
+            insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+            insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+
+            vShuf = VINSERT(vShuf, insertValLow128b, i);
+            vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
+        }
+        res = vShuf;
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
+/// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
+/// lower 8 values are used.
+Value *Builder::PMOVSXBD(Value* a)
+{
+    Value* res;
+    // use avx2 byte sign extend instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPMOVSXBD(a);
+    }
+    else
+    {
+        // VPMOVSXBD output type
+        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        // Extract 8 values from 128bit lane and sign extend
+        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
+/// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
+Value *Builder::PMOVSXWD(Value* a)
+{
+    Value* res;
+    // use avx2 word sign extend if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPMOVSXWD(a);
+    }
+    else
+    {
+        // VPMOVSXWD output type
+        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        // Extract 8 values from 128bit lane and sign extend
+        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMD operation (shuffle 32 bit integer values
+/// across 128 bit lanes) in LLVM IR.
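If not supported on the underlying
+/// platform, emulate it.
+///
+/// Lane-crossing example (illustrative): idx = {7,6,5,4,3,2,1,0} reverses all
+/// eight dwords of a; unlike PSHUFB, VPERMD can move data between the two
+/// 128-bit halves.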
+/// @param a - 256bit SIMD lane(8x32bit) of integer values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMD(Value* a, Value* idx)
+{
+    Value* res;
+    // use avx2 permute instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        // llvm 3.6.0 swapped the order of the args to vpermd
+        res = VPERMD(idx, a);
+    }
+    else
+    {
+        res = VSHUFFLE(a, a, idx);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
+/// in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
+Value *Builder::CVTPH2PS(Value* a)
+{
+    if (JM()->mArch.F16C())
+    {
+        return VCVTPH2PS(a);
+    }
+    else
+    {
+        FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
+        Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
+
+        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+        {
+            sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+        }
+
+        Value* pResult = UndefValue::get(mSimdFP32Ty);
+        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        {
+            Value* pSrc = VEXTRACT(a, C(i));
+            Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
+            pResult = VINSERT(pResult, pConv, C(i));
+        }
+
+        return pResult;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
+/// in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - SIMD lane(8x32bit) of float32 values.
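+/// @param rounding - rounding control for the F16C path; the scalar fallback
+///        below does its own (essentially RTZ) rounding and ignores this operand.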
+Value *Builder::CVTPS2PH(Value* a, Value* rounding) +{ + if (JM()->mArch.F16C()) + { + return VCVTPS2PH(a, rounding); + } + else + { + // call scalar C function for now + FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); + Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy)); + + if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr) + { + sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float); + } + + Value* pResult = UndefValue::get(mSimdInt16Ty); + for (uint32_t i = 0; i < JM()->mVWidth; ++i) + { + Value* pSrc = VEXTRACT(a, C(i)); + Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); + pResult = VINSERT(pResult, pConv, C(i)); + } + + return pResult; + } +} + +Value *Builder::PMAXSD(Value* a, Value* b) +{ + if (JM()->mArch.AVX2()) + { + return VPMAXSD(a, b); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); + + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pmaxsd, {aLo, bLo}); + + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pmaxsd, {aHi, bHi}); + + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); + + return result; + } +} + +Value *Builder::PMINSD(Value* a, Value* b) +{ + if (JM()->mArch.AVX2()) + { + return VPMINSD(a, b); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); + + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pminsd, {aLo, bLo}); + + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pminsd, {aHi, bHi}); + + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); + + return result; + } +} + +void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + const SWR_FORMAT_INFO &info = GetFormatInfo(format); + if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdFP32Ty); + GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } + else + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdInt32Ty); + GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } +} + +void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + switch(info.bpp / info.numComps) + { + case 16: + { + Value* vGatherResult[2]; + Value *vMask; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((float)0); + + // always have at least one component out of x or y to fetch + + // save mask as it is zero'd out after each gather + vMask = mask; + + vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. 
result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if(info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + vMask = mask; + + vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // save mask as it is zero'd out after each gather + Value *vMask = mask; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_ASSERT(0, "Invalid float format"); + break; + } +} + +void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + switch (info.bpp / info.numComps) + { + case 8: + { + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); + // e.g. result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 16: + { + Value* vGatherResult[2]; + Value *vMask; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + + // always have at least one component out of x or y to fetch + + // save mask as it is zero'd out after each gather + vMask = mask; + + vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if(info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + vMask = mask; + + vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. 
result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1((int)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // save mask as it is zero'd out after each gather + Value *vMask = mask; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_ASSERT(0, "unsupported format"); + break; + } +} + +void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) +{ + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + + // input could either be float or int vector; do shuffle work in int + vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); + vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); + + if(bPackedOutput) + { + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy + + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) + { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + } + + for(uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fixed for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // extract packed component 128 bit lanes + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + + } + else + { + // pshufb masks for each component + Value* vConstMask[2]; + // x/z shuffle mask + vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + + // y/w shuffle mask + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + + + // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // select correct constMask for x/z or y/w pshufb + uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + uint32_t selectedGather = (i < 2) ? 0 : 1; + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather + // 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + } + } +} + +void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) +{ + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + + if(bPackedOutput) + { + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww + + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); + // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) + + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) + { + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for(uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fix for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // sign extend + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + } + // else zero extend + else{ + // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++){ + uint32_t swizzleIndex = info.swizzle[i]; + + // pshufb masks for each component + Value* vConstMask; + switch(i) + { + case 0: + // x shuffle mask + vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); + break; + case 1: + // y shuffle mask + vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); + break; + case 2: + // z shuffle mask + vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); + break; + case 3: + // w shuffle mask + vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); + break; + default: + vConstMask = nullptr; + break; + } + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb for x channel + // 256i - 0 1 2 3 4 5 6 7 + // x000 x000 x000 x000 x000 x000 x000 x000 + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief emulates a scatter operation. +/// @param pDst - pointer to destination +/// @param vSrc - vector of src data to scatter +/// @param vOffsets - vector of byte offsets from pDst +/// @param vMask - mask of valid lanes +void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) +{ + Value* pStack = STACKSAVE(); + + // allocate tmp stack for masked off lanes + Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + + Value *mask = MASK(vMask); + for (uint32_t i = 0; i < JM()->mVWidth; ++i) + { + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *storeAddress = GEP(pDst, offset); + storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + Value *selMask = VEXTRACT(mask, C(i)); + Value *srcElem = VEXTRACT(vSrc, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr); + STORE(srcElem, validAddress); + } + + STACKRESTORE(pStack); +} + +Value* Builder::VABSPS(Value* a) +{ + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); + return result; +} + +Value *Builder::ICLAMP(Value* src, Value* low, Value* high) +{ + Value *lowCmp = ICMP_SLT(src, low); + Value *ret = SELECT(lowCmp, low, src); + + Value *highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; +} + +Value *Builder::FCLAMP(Value* src, Value* low, Value* high) +{ + Value *lowCmp = FCMP_OLT(src, low); + Value *ret = SELECT(lowCmp, low, src); + + Value *highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; +} + +Value *Builder::FCLAMP(Value* src, float low, float high) +{ + Value* result = VMAXPS(src, VIMMED1(low)); + result = VMINPS(result, VIMMED1(high)); + + return result; +} + +////////////////////////////////////////////////////////////////////////// 
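+/// A common use of the constant-bounds FCLAMP is a saturate, e.g.
+/// FCLAMP(x, 0.0f, 1.0f). Note that if VMAXPS/VMINPS lower to x86 maxps/minps,
+/// a NaN input clamps to 'low' instead of propagating.
+//////////////////////////////////////////////////////////////////////////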
+/// @brief save/restore stack, providing ability to push/pop the stack and +/// reduce overall stack requirements for temporary stack use +Value* Builder::STACKSAVE() +{ + Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + return CALL(pfnStackSave); +#else + return CALLA(pfnStackSave); +#endif +} + +void Builder::STACKRESTORE(Value* pSaved) +{ + Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); + CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); +} + +Value *Builder::FMADDPS(Value* a, Value* b, Value* c) +{ + Value* vOut; + // use FMADs if available + if(JM()->mArch.AVX2()) + { + vOut = VFMADDPS(a, b, c); + } + else + { + vOut = FADD(FMUL(a, b), c); + } + return vOut; +} + +Value* Builder::POPCNT(Value* a) +{ + Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); + return CALL(pCtPop, std::initializer_list<Value*>{a}); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief C functions called by LLVM IR +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// +/// @brief called in JIT code, inserted by PRINT +/// output to both stdout and visual studio debug console +void __cdecl CallPrint(const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + +#if defined( _WIN32 ) + char strBuf[1024]; + vsnprintf_s(strBuf, _TRUNCATE, fmt, args); + OutputDebugString(strBuf); +#endif +} + +Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) +{ +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vextractf128_si_256); + return CALL(func, {a, imm8}); +#else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { + idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + } + return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); +#endif +} + +Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) +{ +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vinsertf128_si_256); + return CALL(func, {a, b, imm8}); +#else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < JM()->mVWidth; i++) { + idx.push_back(C(i)); + } + Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + + SmallVector<Constant*,8> idx2; + for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + } + for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { + idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + } + return VSHUFFLE(a, inter, ConstantVector::get(idx2)); +#endif +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h new file mode 100644 index 00000000000..48e0558c4dd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -0,0 +1,149 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file builder_misc.h +* +* @brief miscellaneous builder functions +* +* Notes: +* +******************************************************************************/ +#pragma once + +Constant *C(bool i); +Constant *C(char i); +Constant *C(uint8_t i); +Constant *C(int i); +Constant *C(int64_t i); +Constant *C(uint16_t i); +Constant *C(uint32_t i); +Constant *C(float i); + +template<typename Ty> +Constant *C(const std::initializer_list<Ty> &constList) +{ + std::vector<Constant*> vConsts; + for(auto i : constList) { + + vConsts.push_back(C((Ty)i)); + } + return ConstantVector::get(vConsts); +} + +Constant *PRED(bool pred); +Value *VIMMED1(int i); +Value *VIMMED1(uint32_t i); +Value *VIMMED1(float i); +Value *VIMMED1(bool i); +Value *VUNDEF(Type* t); +Value *VUNDEF_F(); +Value *VUNDEF_I(); +Value *VUNDEF(Type* ty, uint32_t size); +Value *VUNDEF_IPTR(); +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +Value *VINSERT(Value *vec, Value *val, uint64_t index); +#endif +Value *VBROADCAST(Value *src); +Value *VRCP(Value *va); +Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); + +uint32_t IMMED(Value* i); + +Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); +Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); +CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args); + +LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = ""); +LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = ""); +StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset); +StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset); + +Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); } +Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); } +Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); } +Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); } +Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); } +Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); } +Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); } +Value *VCMPPS_NOTNAN(Value* a, Value* b){ 
return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); } + +Value *MASK(Value* vmask); +Value *VMASK(Value* mask); + +////////////////////////////////////////////////////////////////////////// +/// @brief functions that build IR to call x86 intrinsics directly, or +/// emulate them with other instructions if not available on the host +////////////////////////////////////////////////////////////////////////// +Value *MASKLOADD(Value* src, Value* mask); + +void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); + +void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); +void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); + +Value *PSHUFB(Value* a, Value* b); +Value *PMOVSXBD(Value* a); +Value *PMOVSXWD(Value* a); +Value *PERMD(Value* a, Value* idx); +Value *CVTPH2PS(Value* a); +Value *CVTPS2PH(Value* a, Value* rounding); +Value *PMAXSD(Value* a, Value* b); +Value *PMINSD(Value* a, Value* b); +Value *VABSPS(Value* a); +Value *FMADDPS(Value* a, Value* b, Value* c); + +// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior +Value *VPCMPGTD(Value* a, Value* b) +{ + Value* vIndexMask = ICMP_UGT(a,b); + + // need to set the high bit for x86 intrinsic masks + return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); +} + +Value *ICLAMP(Value* src, Value* low, Value* high); +Value *FCLAMP(Value* src, Value* low, Value* high); +Value *FCLAMP(Value* src, float low, float high); + +CallInst *PRINT(const std::string &printStr); +CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs); +Value* STACKSAVE(); +void STACKRESTORE(Value* pSaved); + +Value* POPCNT(Value* a); + +Value* INT3() { return INTERRUPT(C((uint8_t)3)); } + + +Value *VEXTRACTI128(Value* a, Constant* imm8); +Value *VINSERTI128(Value* a, Value* b, Constant* imm8); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp new file mode 100644 index 00000000000..c5a180e27cb --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -0,0 +1,1431 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file fetch_jit.cpp +* +* @brief Implementation of the fetch jitter +* +* Notes: +* +******************************************************************************/ +#include "jit_api.h" +#include "fetch_jit.h" +#include "builder.h" +#include "state_llvm.h" +#include "common/containers.hpp" +#include "llvm/IR/DataLayout.h" +#include <sstream> +#include <tuple> + +//#define FETCH_DUMP_VERTEX 1 + +bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); + +enum ConversionType +{ + CONVERT_NONE, + CONVERT_NORMALIZED, + CONVERT_USCALED, + CONVERT_SSCALED, +}; + +////////////////////////////////////////////////////////////////////////// +/// Interface to Jitting a fetch shader +////////////////////////////////////////////////////////////////////////// +struct FetchJit : public Builder +{ + FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; + + Function* Create(const FETCH_COMPILE_STATE& fetchState); + Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); + Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); + Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); + + // package up Shuffle*bpcGatherd args into a tuple for convenience + typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType, + uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4], + const uint32_t (&)[4]> Shuffle8bpcArgs; + void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); + + typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType, + uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs; + void Shuffle16bpcGather(Shuffle16bpcArgs &args); + + void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); + + Value* GenerateCompCtrlVector(const ComponentControl ctrl); + + void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); + void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); +}; + +Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) +{ + static std::size_t fetchNum = 0; + + std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << fetchNum++; + + 
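+    // the generated function gets a unique, human-readable name, e.g.
+    // "FetchShader0", "FetchShader1", ... which shows up in the dump files below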
Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
+
+    IRB()->SetInsertPoint(entry);
+
+    auto argitr = fetch->getArgumentList().begin();
+
+    // Fetch shader arguments
+    Value* fetchInfo = &*argitr; ++argitr;
+    fetchInfo->setName("fetchInfo");
+    Value* pVtxOut = &*argitr;
+    pVtxOut->setName("vtxOutput");
+    // This is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
+    // index 0 (just the pointer to the simdvertex structure),
+    // index 1 (which element of the simdvertex structure to offset to, in this case 0),
+    // so it doesn't matter that the indices are i32's.
+    // TODO: generate this GEP with a VECTOR structure type so this makes sense.
+    std::vector<Value*> vtxInputIndices(2, C(0));
+    // GEP
+    pVtxOut = GEP(pVtxOut, C(0));
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+
+    // SWR_FETCH_CONTEXT::pStreams
+    Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
+    streams->setName("pStreams");
+
+    // SWR_FETCH_CONTEXT::pIndices
+    Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
+    indices->setName("pIndices");
+
+    // SWR_FETCH_CONTEXT::pLastIndex
+    Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
+    pLastIndex->setName("pLastIndex");
+
+
+    Value* vIndices;
+    switch(fetchState.indexType)
+    {
+        case R8_UINT:
+            indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
+            if(fetchState.bDisableIndexOOBCheck){
+                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+            }
+            else{
+                pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
+                vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
+            }
+            break;
+        case R16_UINT:
+            indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
+            if(fetchState.bDisableIndexOOBCheck){
+                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+            }
+            else{
+                pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
+                vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
+            }
+            break;
+        case R32_UINT:
+            (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
+                                               : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
+            break; // incoming type is already 32bit int
+        default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
+    }
+
+    // store out vertex IDs
+    STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
+
+    // store out cut mask if enabled
+    if (fetchState.bEnableCutIndex)
+    {
+        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
+        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
+        STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
+    }
+
+    // Fetch attributes from memory and output to a simdvertex struct
+    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
+    (fetchState.bDisableVGATHER) ?
JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut) + : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut); + + RET_VOID(); + + JitManager::DumpToFile(fetch, "src"); + + verifyFunction(*fetch); + + FunctionPassManager setupPasses(JM()->mpCurrentModule); + + ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) + setupPasses.add(createBreakCriticalEdgesPass()); + setupPasses.add(createCFGSimplificationPass()); + setupPasses.add(createEarlyCSEPass()); + setupPasses.add(createPromoteMemoryToRegisterPass()); + + setupPasses.run(*fetch); + + JitManager::DumpToFile(fetch, "se"); + + FunctionPassManager optPasses(JM()->mpCurrentModule); + + ///@todo Haven't touched these either. Need to remove some of these and add others. + optPasses.add(createCFGSimplificationPass()); + optPasses.add(createEarlyCSEPass()); + optPasses.add(createInstructionCombiningPass()); + optPasses.add(createInstructionSimplifierPass()); + optPasses.add(createConstantPropagationPass()); + optPasses.add(createSCCPPass()); + optPasses.add(createAggressiveDCEPass()); + + optPasses.run(*fetch); + optPasses.run(*fetch); + + JitManager::DumpToFile(fetch, "opt"); + + return fetch; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads attributes from memory using LOADs, shuffling the +/// components into SOA form. +/// *Note* currently does not support component control, +/// component packing, or instancing +/// @param fetchState - info about attributes to be fetched from memory +/// @param streams - value pointer to the current vertex stream +/// @param vIndices - vector value of indices to load +/// @param pVtxOut - value pointer to output simdvertex struct +void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut) +{ + // Zack shuffles; a variant of the Charleston. + + SWRL::UncheckedFixedVector<Value*, 16> vectors; + + std::vector<Constant*> pMask(JM()->mVWidth); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + pMask[i] = (C(i < 4 ? i : 4)); + } + Constant* promoteMask = ConstantVector::get(pMask); + Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); + + Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); + + for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) + { + Value* elements[4] = {0}; + const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; + const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); + uint32_t numComponents = info.numComps; + uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. + + vectors.clear(); + + // load SWR_VERTEX_BUFFER_STATE::pData + Value *stream = LOAD(streams, {ied.StreamIndex, 2}); + + // load SWR_VERTEX_BUFFER_STATE::pitch + Value *stride = LOAD(streams, {ied.StreamIndex, 1}); + stride = Z_EXT(stride, mInt64Ty); + + // load SWR_VERTEX_BUFFER_STATE::size + Value *size = LOAD(streams, {ied.StreamIndex, 3}); + size = Z_EXT(size, mInt64Ty); + + Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); + + // Load from the stream. 
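+        // Illustration (assuming vWidth == 8, i.e. one AVX 256b register of 8 floats):
+        // each lane computes offset = index * stride + AlignedByteOffset + startVertexOffset,
+        // loads a 4-component AOS attribute from the stream at that offset, and the
+        // shuffles below then transpose the 8 AOS vectors into SOA x/y/z/w rows.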
+ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + { + // Get index + Value* index = VEXTRACT(vIndices, C(lane)); + index = Z_EXT(index, mInt64Ty); + + Value* offset = MUL(index, stride); + offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); + offset = ADD(offset, startVertexOffset); + + if (!fetchState.bDisableIndexOOBCheck) { + // check for out of bound access, including partial OOB, and mask them to 0 + Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); + Value *oob = ICMP_ULE(endOffset, size); + offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0)); + } + + Value* pointer = GEP(stream, offset); + // We use a full-lane, but don't actually care. + Value* vptr = 0; + + // get a pointer to a 4 component attrib in default address space + switch(bpc) + { + case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; + case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; + case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; + default: SWR_ASSERT(false, "Unsupported underlying bpp!"); + } + + // load 4 components of attribute + Value* vec = ALIGNED_LOAD(vptr, 1, false); + + // Convert To FP32 internally + switch(info.type[0]) + { + case SWR_TYPE_UNORM: + switch(bpc) + { + case 8: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); + break; + case 16: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); + break; + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_SNORM: + switch(bpc) + { + case 8: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); + break; + case 16: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); + break; + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_UINT: + // Zero extend uint32_t types. + switch(bpc) + { + case 8: + case 16: + vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); + vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); + break; + case 32: + break; // Pass through unchanged. + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_SINT: + // Sign extend SINT types. + switch(bpc) + { + case 8: + case 16: + vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); + vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); + break; + case 32: + break; // Pass through unchanged. + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_FLOAT: + switch(bpc) + { + case 32: + break; // Pass through unchanged. 
+ default: + SWR_ASSERT(false, "Unsupported underlying type!"); + } + break; + case SWR_TYPE_USCALED: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + break; + case SWR_TYPE_SSCALED: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + break; + case SWR_TYPE_UNKNOWN: + case SWR_TYPE_UNUSED: + SWR_ASSERT(false, "Unsupported type %d!", info.type[0]); + } + + // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) + // uwvec: 4 x F32, undef value + Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); + vectors.push_back(wvec); + } + + std::vector<Constant*> v01Mask(JM()->mVWidth); + std::vector<Constant*> v23Mask(JM()->mVWidth); + std::vector<Constant*> v02Mask(JM()->mVWidth); + std::vector<Constant*> v13Mask(JM()->mVWidth); + + // Concatenate the vectors together. + elements[0] = VUNDEF_F(); + elements[1] = VUNDEF_F(); + elements[2] = VUNDEF_F(); + elements[3] = VUNDEF_F(); + for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + { + v01Mask[4 * b + 0] = C(0 + 4 * b); + v01Mask[4 * b + 1] = C(1 + 4 * b); + v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + + v23Mask[4 * b + 0] = C(2 + 4 * b); + v23Mask[4 * b + 1] = C(3 + 4 * b); + v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + + v02Mask[4 * b + 0] = C(0 + 4 * b); + v02Mask[4 * b + 1] = C(2 + 4 * b); + v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + + v13Mask[4 * b + 0] = C(1 + 4 * b); + v13Mask[4 * b + 1] = C(3 + 4 * b); + v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + + std::vector<Constant*> iMask(JM()->mVWidth); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + if(((4 * b) <= i) && (i < (4 * (b + 1)))) + { + iMask[i] = C(i % 4 + JM()->mVWidth); + } + else + { + iMask[i] = C(i); + } + } + Constant* insertMask = ConstantVector::get(iMask); + elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); + elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); + elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); + elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); + } + + Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); + Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); + Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); + Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); + elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); + elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); + elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask)); + elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask)); + + switch(numComponents + 1) + { + case 1: elements[0] = VIMMED1(0.0f); + case 2: elements[1] = VIMMED1(0.0f); + case 3: elements[2] = VIMMED1(0.0f); + case 4: elements[3] = VIMMED1(1.0f); + } + + for(uint32_t c = 0; c < 4; ++c) + { + Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); + STORE(elements[c], dest); + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads attributes from memory using AVX2 GATHER(s) +/// @param fetchState - info about attributes to be fetched from memory +/// @param fetchInfo - first argument passed to fetch shader +/// @param streams - value pointer to the 
current vertex stream
+/// @param vIndices - vector value of indices to gather
+/// @param pVtxOut - value pointer to output simdvertex struct
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
+                                 Value* streams, Value* vIndices, Value* pVtxOut)
+{
+    uint32_t currentVertexElement = 0;
+    uint32_t outputElt = 0;
+    Value* vVertexElements[4];
+
+    Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
+    Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
+    Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
+    Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
+    curInstance->setName("curInstance");
+
+    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+    {
+        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
+        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
+        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
+
+        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+
+        // VGATHER* takes an *i8 src pointer
+        Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
+
+        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
+        Value *vStride = VBROADCAST(stride);
+
+        // max vertex index that is fully in bounds
+        Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
+        maxVertex = LOAD(maxVertex);
+
+        Value *vCurIndices;
+        Value *startOffset;
+        if(ied.InstanceEnable)
+        {
+            Value* stepRate = C(ied.InstanceDataStepRate);
+
+            // prevent a div by 0 for 0 step rate
+            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
+            stepRate = SELECT(isNonZeroStep, stepRate, C(1));
+
+            // calc the current offset into instanced data buffer
+            Value* calcInstance = UDIV(curInstance, stepRate);
+
+            // if step rate is 0, every instance gets instance 0
+            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
+
+            vCurIndices = VBROADCAST(calcInstance);
+
+            startOffset = startInstance;
+        }
+        else
+        {
+            // offset indices by baseVertex
+            vCurIndices = ADD(vIndices, vBaseVertex);
+
+            startOffset = startVertex;
+        }
+
+        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
+        // do 64bit address offset calculations.
+
+        // calculate byte offset to the start of the VB
+        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
+        pStreamBase = GEP(pStreamBase, baseOffset);
+
+        // if we have a start offset, subtract from max vertex. Used for OOB check
+        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
+        Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
+        // if we have a negative value, we're already OOB. clamp at 0.
+        maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
+
+        // Load the in bounds size of a partially valid vertex
+        Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
+        partialInboundsSize = LOAD(partialInboundsSize);
+        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
+        Value* vBpp = VBROADCAST(C(info.Bpp));
+        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
+
+        // is the element <= the partially valid size
+        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
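+
+        // e.g. (illustration, assuming vWidth == 8 and maxVertex == 4):
+        //   vCurIndices        0  1  2  3  4  5  6  7
+        //   fully in bounds   -1 -1 -1 -1  0  0  0  0   (vCurIndices <  maxVertex)
+        //   partially OOB      0  0  0  0 -1  0  0  0   (vCurIndices == maxVertex)
+        // lanes beyond maxVertex never gather; the lane equal to maxVertex gathers
+        // only if the element fits within partialInboundsSize (vElementInBoundsMask)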
+
+        // are vertices partially OOB?
+        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
+
+        // are vertices fully in bounds?
+        Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
+
+        // blend in any partially OOB indices that have valid elements
+        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
+        vGatherMask = VMASK(vGatherMask);
+
+        // calculate the actual offsets into the VB
+        Value* vOffsets = MUL(vCurIndices, vStride);
+        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+
+        // Packing and component control
+        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
+        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
+                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
+
+        if(info.type[0] == SWR_TYPE_FLOAT)
+        {
+            ///@todo: support 64 bit vb accesses
+            Value* gatherSrc = VIMMED1(0.0f);
+
+            // Gather components from memory to store in a simdvertex structure
+            switch(bpc)
+            {
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            // Gather a SIMD of vertices
+                            vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+                default:
+                    SWR_ASSERT(0, "Tried to fetch invalid FP format");
+                    break;
+            }
+        }
+        else
+        {
+            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
+            ConversionType conversionType = CONVERT_NONE;
+
+            switch(info.type[0])
+            {
+                case SWR_TYPE_UNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_UINT:
+                    extendCastType = Instruction::CastOps::ZExt;
+                    break;
+                case SWR_TYPE_SNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_SINT:
+                    extendCastType = Instruction::CastOps::SExt;
+                    break;
+                case SWR_TYPE_USCALED:
+                    conversionType = CONVERT_USCALED;
+                    extendCastType = Instruction::CastOps::UIToFP;
+                    break;
+                case SWR_TYPE_SSCALED:
+                    conversionType = CONVERT_SSCALED;
+                    extendCastType = Instruction::CastOps::SIToFP;
+                    break;
+                default:
+                    break;
+            }
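+
+            // e.g. the resulting (cast, conversion) pairs (note the intentional
+            // fallthroughs above):
+            //   UNORM   -> ZExt   + CONVERT_NORMALIZED (e.g. scale by 1/255 for 8-bit)
+            //   UINT    -> ZExt   + CONVERT_NONE
+            //   SNORM   -> SExt   + CONVERT_NORMALIZED
+            //   SINT    -> SExt   + CONVERT_NONE
+            //   USCALED -> UIToFP, SSCALED -> SIToFP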
+
+            // value substituted when component of gather is masked
+            Value* gatherSrc = VIMMED1(0);
+
+            // Gather components from memory to store in a simdvertex structure
+            switch (bpc)
+            {
+                case 8:
+                {
+                    // if we have at least one component to fetch
+                    if(compMask){
+                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+                        // e.g. result of an 8x32bit integer gather for 8bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    SWR_ASSERT(conversionType == CONVERT_NONE);
+
+                    // Gather components into place in simdvertex struct
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+
+                            // e.g. result of a single 8x32bit integer gather for 32bit components
+                            // 256i -    0    1    2    3    4    5    6    7
+                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+            }
+        }
+    }
+
+    // if we have a partially filled vVertexElement struct, output it
+    if(currentVertexElement > 0){
+        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// *Note* have to do 8bit index checking in scalar until we have AVX-512
+/// support
+/// @param pIndices - pointer to 8 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // can fit 4 8 bit integers per vWidth lane
+    Value* vIndices = VUNDEF_I();
+
+    // store 0 index on stack to be used to conditionally load from if index address is OOB
+    Value* pZeroIndex = ALLOCA(mInt8Ty);
+    STORE(C((uint8_t)0), pZeroIndex);
+
+    // Load a SIMD of index pointers
+    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    {
+        // Calculate the address of the requested index
+        Value *pIndex = GEP(pIndices, C(lane));
+
+        // check if the address is less than the max index
+        Value* mask = ICMP_ULT(pIndex, pLastIndex);
+
+        // if valid, load the index. if not, load 0 from the stack
+        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
+        Value *index = LOAD(pValid, "valid index");
+
+        // zero extend index to 32 bits and insert into the correct simd lane
+        index = Z_EXT(index, mInt32Ty);
+        vIndices = VINSERT(vIndices, index, lane);
+    }
+    return vIndices;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// *Note* have to do 16bit index checking in scalar until we have AVX-512
+/// support
+/// @param pIndices - pointer to 16 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // can fit 2 16 bit integers per vWidth lane
+    Value* vIndices = VUNDEF_I();
+
+    // store 0 index on stack to be used to conditionally load from if index address is OOB
+    Value* pZeroIndex = ALLOCA(mInt16Ty);
+    STORE(C((uint16_t)0), pZeroIndex);
+
+    // Load a SIMD of index pointers
+    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    {
+        // Calculate the address of the requested index
+        Value *pIndex = GEP(pIndices, C(lane));
+
+        // check if the address is less than the max index
+        Value* mask = ICMP_ULT(pIndex, pLastIndex);
+
+        // if valid, load the index. if not, load 0 from the stack
+        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
+        Value *index = LOAD(pValid, "valid index");
+
+        // zero extend index to 32 bits and insert into the correct simd lane
+        index = Z_EXT(index, mInt32Ty);
+        vIndices = VINSERT(vIndices, index, lane);
+    }
+    return vIndices;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// @param pIndices - pointer to 32 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    DataLayout dL(JM()->mpCurrentModule);
+    unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
+    Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
+    Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
+
+    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
+    Value* numIndicesLeft = SUB(iLastIndex,iIndices);
+    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
+    numIndicesLeft = SDIV(numIndicesLeft, C(4));
+
+    // create a vector of index counts from the base index ptr passed into the fetch
+    const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
+    Constant* vIndexOffsets = ConstantVector::get(vecIndices);
+
+    // compare index count to the max valid index
+    // e.g. vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
+    //      vIndexOffsets  0 1 2 3 4 5 6 7
+    //      ------------------------------
+    //      vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
+    //      vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
+    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
+    Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
+
+    // VMASKLOAD takes an *i8 src pointer
+    pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
+
+    // Load the indices; OOB loads 0
+    return MASKLOADD(pIndices,vIndexMask);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+/// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult - 8 gathered 8bpc vertices
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param conversionType - normalization / scaling conversion to perform
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current offset from simdvertex we're outputting to
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+/// @param swizzle[4] - component swizzle location
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+{
+    // Unpack tuple args
+    Value*& vGatherResult = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+    const uint32_t (&swizzle)[4] = std::get<9>(args);
+
+    // cast types
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4);  // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
+        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2);  // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4);  // vwidth is units of 32 bits
+
+        // shuffle mask, including any swizzling
+        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
+        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
+        Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
+                                     char(y), char(y+4), char(y+8), char(y+12),
+                                     char(z), char(z+4), char(z+8), char(z+12),
+                                     char(w), char(w+4), char(w+8), char(w+12),
+                                     char(x), char(x+4), char(x+8), char(x+12),
+                                     char(y), char(y+4), char(y+8), char(y+12),
+                                     char(z), char(z+4), char(z+8), char(z+12),
+                                     char(w), char(w+4), char(w+8), char(w+12)});
+
+        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+        // after pshufb: group components together in each 128bit lane
+        // 256i -    0    1    2    3    4    5    6    7
+        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+        Value* vi128XY = nullptr;
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
+            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+        }
+
+        // do the same for zw components
+        Value* vi128ZW = nullptr;
+        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 127.0));
+            break;
+        case CONVERT_SSCALED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_ASSERT(0, "Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+                // sign extend
+                vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
+
+                // denormalize if needed
+                if(conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 255.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_SSCALED:
+            SWR_ASSERT(0, "Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
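+
+        // e.g. UNORM8 after UIToFP and the 1/255 factor: 255 -> 1.0f, 128 -> ~0.502f;
+        // USCALED leaves the integer value as a float (factor 1.0)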
+
+        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // pshufb masks for each component
+                Value* vConstMask;
+                switch(swizzle[i]){
+                case 0:
+                    // x shuffle mask
+                    vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                                          0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
+                    break;
+                case 1:
+                    // y shuffle mask
+                    vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                                          1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
+                    break;
+                case 2:
+                    // z shuffle mask
+                    vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                                          2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
+                    break;
+                case 3:
+                    // w shuffle mask
+                    vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                                          3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
+                    break;
+                default:
+                    vConstMask = nullptr;
+                    break;
+                }
+
+                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+                // after pshufb for x channel
+                // 256i -    0    1    2    3    4    5    6    7
+                //        x000 x000 x000 x000 x000 x000 x000 x000
+
+                // denormalize if needed
+                if (conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    else
+    {
+        SWR_ASSERT(0, "Unsupported conversion type");
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+/// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param conversionType - normalization / scaling conversion to perform
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current offset from simdvertex we're outputting to
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+{
+    // Unpack tuple args
+    Value* (&vGatherResult)[2] = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+
+    // cast types
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4);  // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
+        (extendType == Instruction::CastOps::FPExt))
+    {
+        // is this a half-precision (FP16) float?
+        bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+        Type* v8x16Ty = VectorType::get(mInt16Ty, 8);  // 8x16bit in a 128bit lane
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4);  // vwidth is units of 32 bits
+
+        // shuffle mask
+        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+        Value* vi128XY = nullptr;
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+            // after pshufb: group components together in each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+            // after PERMD: move and pack xy components into each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+        }
+
+        // do the same for zw components
+        Value* vi128ZW = nullptr;
+        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps IntToFpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+            break;
+        case CONVERT_SSCALED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_ASSERT(0, "Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
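+
+        // e.g. SNORM16 after SIToFP and the 1/32767 factor: 32767 -> 1.0f,
+        // -32767 -> -1.0f; SSCALED just converts the integer value (factor 1.0)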
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+                if(bFP) {
+                    // extract 128 bit lanes and convert each half-precision component to 32-bit float
+                    vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+                }
+                else {
+                    // extract 128 bit lanes to sign extend each component
+                    vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+
+                    // denormalize if needed
+                    if(conversionType != CONVERT_NONE){
+                        vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                    }
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // pshufb masks for each component
+        Value* vConstMask[2];
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+        }
+
+        if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0f));
+            break;
+        case CONVERT_SSCALED:
+            SWR_ASSERT(0, "Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // select correct constMask for x/z or y/w pshufb
+                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use the first gather result, else use the second (zw) gather result
+                uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                // 256i -    0    1    2    3    4    5    6    7
+                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+                // denormalize if needed
+                if(conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    else
+    {
+        SWR_ASSERT(0, "Unsupported conversion type");
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Output a simdvertex worth of elements to the current outputElt
+/// @param pVtxOut - base address of VIN output struct
+/// @param outputElt - simdvertex offset in VIN to write to
+/// @param numEltsToStore - number of simdvertex rows to write out
+/// @param vVertexElements - LLVM Value*[] simdvertex to write out
+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+{
+    for(uint32_t c = 0; c < numEltsToStore; ++c)
+    {
+        // STORE expects FP32 x vWidth type, just bitcast if needed
+        if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
+#if FETCH_DUMP_VERTEX
+            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
+#endif
+            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
+        }
+#if FETCH_DUMP_VERTEX
+        else
+        {
+            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
+        }
+#endif
+        // outputElt * 4 = offsetting by the size of a simdvertex
+        // + c offsets to a 32bit x vWidth row within the current vertex
+        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+        STORE(vVertexElements[c], dest);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generates a constant vector of values based on the
+/// ComponentControl value
+/// @param ctrl - ComponentControl value
+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+{
+    switch(ctrl)
+    {
+        case NoStore: return VUNDEF_I();
+        case Store0: return VIMMED1(0);
+        case Store1Fp: return VIMMED1(1.0f);
+        case Store1Int: return VIMMED1(1);
+        case StoreSrc:
+        default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns the enable mask for the specified component.
+/// @param enableMask - enable bits
+/// @param component - component to check if enabled.
+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) +{ + switch (component) + { + // X + case 0: return (enableMask & ComponentEnable::X); + // Y + case 1: return (enableMask & ComponentEnable::Y); + // Z + case 2: return (enableMask & ComponentEnable::Z); + // W + case 3: return (enableMask & ComponentEnable::W); + + default: return false; + } +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief JITs from fetch shader IR +/// @param hJitMgr - JitManager handle +/// @param func - LLVM function IR +/// @return PFN_FETCH_FUNC - pointer to fetch code +PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function* func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_FETCH_FUNC pfnFetch; + + pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + +#if defined(KNOB_SWRC_TRACING) + char fName[1024]; + const char *funcName = func->getName().data(); + sprintf(fName, "%s.bin", funcName); + FILE *fd = fopen(fName, "wb"); + fwrite((void *)pfnFetch, 1, 2048, fd); + fclose(fd); +#endif + + return pfnFetch; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles fetch shader +/// @param hJitMgr - JitManager handle +/// @param state - fetch state to build function from +extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + pJitMgr->SetupNewModule(); + + FetchJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(state); + + return JitFetchFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h new file mode 100644 index 00000000000..ea3625d2fde --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h @@ -0,0 +1,128 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+*
+* @file fetch_jit.h
+*
+* @brief Definition of the fetch jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// INPUT_ELEMENT_DESC
+//////////////////////////////////////////////////////////////////////////
+struct INPUT_ELEMENT_DESC
+{
+    union
+    {
+        struct
+        {
+            uint32_t AlignedByteOffset : 12;
+            uint32_t Format : 10;
+            uint32_t StreamIndex : 6;
+            uint32_t InstanceEnable : 1;
+            uint32_t ComponentControl0 : 3;
+            uint32_t ComponentControl1 : 3;
+            uint32_t ComponentControl2 : 3;
+            uint32_t ComponentControl3 : 3;
+            uint32_t ComponentPacking : 4;
+            uint32_t _reserved : 19;
+        };
+        uint64_t bits;
+    };
+    uint32_t InstanceDataStepRate;
+};
+
+// used to set ComponentPacking
+enum ComponentEnable
+{
+    NONE = 0x0,
+    X = 0x1,
+    Y = 0x2,
+    XY = 0x3,
+    Z = 0x4,
+    XZ = 0x5,
+    YZ = 0x6,
+    XYZ = 0x7,
+    W = 0x8,
+    XW = 0x9,
+    YW = 0xA,
+    XYW = 0xB,
+    ZW = 0xC,
+    XZW = 0xD,
+    YZW = 0xE,
+    XYZW = 0xF,
+};
+
+enum ComponentControl
+{
+    NoStore = 0,
+    StoreSrc = 1,
+    Store0 = 2,
+    Store1Fp = 3,
+    Store1Int = 4,
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for fetch shader jit compile.
+//////////////////////////////////////////////////////////////////////////
+struct FETCH_COMPILE_STATE
+{
+    uint32_t numAttribs;
+    INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
+    SWR_FORMAT indexType;
+    uint32_t cutIndex{ 0xffffffff };
+
+    // Options that affect the JIT'd code
+    bool bDisableVGATHER;           // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs
+    bool bDisableIndexOOBCheck;     // if enabled, FetchJit will exclude index OOB check
+    bool bEnableCutIndex{ false };  // compares indices with the cut index and returns a cut mask
+
+    FETCH_COMPILE_STATE(bool disableVGATHER = false, bool disableIndexOOBCheck = false) :
+        bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(disableIndexOOBCheck){};
+
+    bool operator==(const FETCH_COMPILE_STATE &other) const
+    {
+        if (numAttribs != other.numAttribs) return false;
+        if (indexType != other.indexType) return false;
+        if (bDisableVGATHER != other.bDisableVGATHER) return false;
+        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
+        if (bEnableCutIndex != other.bEnableCutIndex) return false;
+        if (cutIndex != other.cutIndex) return false;
+
+        for(uint32_t i = 0; i < numAttribs; ++i)
+        {
+            if((layout[i].bits != other.layout[i].bits) ||
+               ((layout[i].InstanceEnable == 1) &&
+                (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){
+                return false;
+            }
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
new file mode 100644
index 00000000000..39d63836673
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -0,0 +1,108 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file jit_api.h +* +* @brief Platform independent JIT interface +* +* Notes: +* +******************************************************************************/ +#pragma once +#include "common/os.h" + +#include "fetch_jit.h" +#include "streamout_jit.h" +#include "blend_jit.h" + +#if defined(_WIN32) +#define EXCEPTION_PRINT_STACK(ret) ret +#endif // _WIN32 + +#if defined(_WIN32) +#define JITCALL __stdcall +#else +#define JITCALL +#endif + +extern "C" +{ + +struct ShaderInfo; + +////////////////////////////////////////////////////////////////////////// +/// Jit Compile Info Input +////////////////////////////////////////////////////////////////////////// +struct JIT_COMPILE_INPUT +{ + SWR_SHADER_TYPE type; + + const void* pIR; ///< Pointer to LLVM IR text. + + bool enableJitSampler; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Create JIT context. +HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch); + +////////////////////////////////////////////////////////////////////////// +/// @brief Destroy JIT context. +void JITCALL JitDestroyContext(HANDLE hJitContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compile shader. +/// @param hJitContext - Jit Context +/// @param input - Input containing LLVM IR and other information +/// @param output - Output containing information about JIT shader +ShaderInfo* JITCALL JitCompileShader( + HANDLE hJitContext, + const JIT_COMPILE_INPUT& input); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT destroy shader. +/// @param hJitContext - Jit Context +/// @param pShaderInfo - pointer to shader object. 
+void JITCALL JitDestroyShader( + HANDLE hJitContext, + ShaderInfo*& pShaderInfo); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles fetch shader +/// @param hJitContext - Jit Context +/// @param state - Fetch state to build function from +PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles streamout shader +/// @param hJitContext - Jit Context +/// @param state - SO state to build function from +PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles blend shader +/// @param hJitContext - Jit Context +/// @param state - blend state to build function from +PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); + + +}; // extern "C" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py new file mode 100644 index 00000000000..1814b7c8d5f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py @@ -0,0 +1,401 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +#!deps/python32/python.exe + +import os, sys, re +import argparse +import json as JSON +import operator + +header = r"""/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file %s +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +""" + +""" +""" +def gen_file_header(filename): + global header + headerStr = header % filename + return headerStr.splitlines() + + +inst_aliases = { + 'SHUFFLE_VECTOR': 'VSHUFFLE', + 'INSERT_ELEMENT': 'VINSERT', + 'EXTRACT_ELEMENT': 'VEXTRACT', + 'MEM_SET': 'MEMSET', + 'MEM_CPY': 'MEMCPY', + 'MEM_MOVE': 'MEMMOVE', + 'L_SHR': 'LSHR', + 'A_SHR': 'ASHR', + 'BIT_CAST': 'BITCAST', + 'U_DIV': 'UDIV', + 'S_DIV': 'SDIV', + 'U_REM': 'UREM', + 'S_REM': 'SREM', + 'BIN_OP': 'BINOP', +} + +intrinsics = [ + ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], + ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], + ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], + ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], + ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], + ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], + ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], + ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], + ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]], + ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]], + ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]], + ["VMASKLOADD", "x86_avx2_maskload_d_256", ["src", "mask"]], + ["VMASKMOVPS", "x86_avx_maskload_ps_256", ["src", "mask"]], + ["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]], + ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components + ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components + ["VPERMD", "x86_avx2_permd", ["idx", "a"]], + ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]], + ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]], + ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], + ["VPTESTC", "x86_avx_ptestc_256", ["a", "b"]], + ["VPTESTZ", "x86_avx_ptestz_256", ["a", "b"]], + ["VFMADDPS", "x86_fma_vfmadd_ps_256", ["a", "b", "c"]], + ["VCVTTPS2DQ", "x86_avx_cvtt_ps2dq_256", ["a"]], + ["VMOVMSKPS", "x86_avx_movmsk_ps_256", ["a"]], + ["INTERRUPT", "x86_int", ["a"]], + ] + +def convert_uppercamel(name): + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper() + +""" + Given an input file (e.g. IRBuilder.h) generates function dictionary. 
+""" +def parse_ir_builder(input_file): + + functions = [] + + lines = input_file.readlines() + + idx = 0 + while idx < len(lines) - 1: + line = lines[idx].rstrip() + idx += 1 + + #match = re.search(r"\*Create", line) + match = re.search(r"[\*\s]Create(\w*)\(", line) + if match is not None: + #print("Line: %s" % match.group(1)) + + if re.search(r"^\s*Create", line) is not None: + func_sig = lines[idx-2].rstrip() + line + else: + func_sig = line + + end_of_args = False + while not end_of_args: + end_paren = re.search(r"\)", line) + if end_paren is not None: + end_of_args = True + else: + line = lines[idx].rstrip() + func_sig += line + idx += 1 + + delfunc = re.search(r"LLVM_DELETED_FUNCTION|= delete;", func_sig) + + if not delfunc: + func = re.search(r"(.*?)\*[\n\s]*(Create\w*)\((.*?)\)", func_sig) + if func is not None: + + return_type = func.group(1).lstrip() + '*' + func_name = func.group(2) + arguments = func.group(3) + + func_args = '' + func_args_nodefs = '' + + num_args = arguments.count(',') + + arg_names = [] + num_args = 0 + args = arguments.split(',') + for arg in args: + arg = arg.lstrip() + if arg: + if num_args > 0: + func_args += ', ' + func_args_nodefs += ', ' + func_args += arg + func_args_nodefs += arg.split(' =')[0] + + split_args = arg.split('=') + arg_name = split_args[0].rsplit(None, 1)[-1] + + #print("Before ArgName = %s" % arg_name) + + reg_arg = re.search(r"[\&\*]*(\w*)", arg_name) + if reg_arg: + #print("Arg Name = %s" % reg_arg.group(1)) + arg_names += [reg_arg.group(1)] + + num_args += 1 + + ignore = False + + # The following functions need to be ignored. + if func_name == 'CreateInsertNUWNSWBinOp': + ignore = True + + if func_name == 'CreateMaskedIntrinsic': + ignore = True + + # Convert CamelCase to CAMEL_CASE + func_mod = re.search(r"Create(\w*)", func_name) + if func_mod: + func_mod = func_mod.group(1) + func_mod = convert_uppercamel(func_mod) + if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_': + func_mod = func_mod[0] + func_mod[2:] + + # Substitute alias based on CAMEL_CASE name. 
+ func_alias = inst_aliases.get(func_mod) + if not func_alias: + func_alias = func_mod + + if func_name == 'CreateCall' or func_name == 'CreateGEP': + arglist = re.search(r'ArrayRef', func_args) + if arglist: + func_alias = func_alias + 'A' + + if not ignore: + functions.append({ + "name": func_name, + "alias": func_alias, + "return": return_type, + "args": func_args, + "args_nodefs": func_args_nodefs, + "arg_names": arg_names + }) + + return functions + +""" + Auto-generates macros for LLVM IR +""" +def generate_gen_h(functions, output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#pragma once', + '', + '//////////////////////////////////////////////////////////////////////////', + '/// Auto-generated Builder IR declarations', + '//////////////////////////////////////////////////////////////////////////', + ] + + for func in functions: + name = func['name'] + if func['alias']: + name = func['alias'] + output_lines += [ + '%s%s(%s);' % (func['return'], name, func['args']) + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_gen_cpp(functions, output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#include \"builder.h\"', + '' + ] + + for func in functions: + name = func['name'] + if func['alias']: + name = func['alias'] + + args = func['arg_names'] + func_args = '' + first_arg = True + for arg in args: + if not first_arg: + func_args += ', ' + func_args += arg + first_arg = False + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + '%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']), + '{', + ' return IRB()->%s(%s);' % (func['name'], func_args), + '}', + '', + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_x86_h(output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#pragma once', + '', + '//////////////////////////////////////////////////////////////////////////', + '/// Auto-generated x86 intrinsics', + '//////////////////////////////////////////////////////////////////////////', + ] + + for inst in intrinsics: + #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2]))) + + args = '' + first = True + for arg in inst[2]: + if not first: + args += ', ' + args += ("Value* %s" % arg) + first = False + + output_lines += [ + 'Value *%s(%s);' % (inst[0], args) + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_x86_cpp(output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#include \"builder.h\"', + '' + ] + + for inst in intrinsics: + #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2]))) + + args = '' + pass_args = '' + first = True + for arg in inst[2]: + if not first: + args += ', ' + pass_args += ', ' + args += ("Value* %s" % arg) + pass_args += arg + first = False + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + 'Value *Builder::%s(%s)' % (inst[0], args), + '{', + ' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1], + ' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args, + '}', + '', + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Function which 
is invoked when this script is started from a command line. + Will present and consume a set of arguments which will tell this script how + to behave +""" +def main(): + + # Parse args... + parser = argparse.ArgumentParser() + parser.add_argument("--input", "-i", type=argparse.FileType('r'), help="Path to IRBuilder.h", required=False) + parser.add_argument("--output", "-o", type=argparse.FileType('w'), help="Path to output file", required=True) + parser.add_argument("--gen_h", "-gen_h", help="Generate builder_gen.h", action="store_true", default=False) + parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate builder_gen.cpp", action="store_true", default=False) + parser.add_argument("--gen_x86_h", "-gen_x86_h", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False) + parser.add_argument("--gen_x86_cpp", "-gen_x86_cpp", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False) + args = parser.parse_args() + + if args.input: + functions = parse_ir_builder(args.input) + + if args.gen_h: + generate_gen_h(functions, args.output) + + if args.gen_cpp: + generate_gen_cpp(functions, args.output) + else: + if args.gen_x86_h: + generate_x86_h(args.output) + + if args.gen_x86_cpp: + generate_x86_cpp(args.output) + + if args.gen_h: + print("Need to specify --input for --gen_h!") + + if args.gen_cpp: + print("Need to specify --input for --gen_cpp!") + +if __name__ == '__main__': + main() +# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py new file mode 100644 index 00000000000..7bba435467b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -0,0 +1,341 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +#!deps/python32/python.exe + +import os, sys, re +import argparse +import json as JSON +import operator + +header = r""" +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file %s +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +""" + +""" +""" +def gen_file_header(filename): + global header + headerStr = header % filename + return headerStr.splitlines() + +""" +""" +def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): + + llvm_type = '' + + if is_llvm_struct: + if is_pointer or is_pointer_pointer: + llvm_type = 'Type::getInt32Ty(ctx)' + else: + llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type + elif is_llvm_enum: + llvm_type = 'Type::getInt32Ty(ctx)' + elif is_llvm_pfn: + llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' + else: + if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool': + llvm_type = 'Type::getInt8Ty(ctx)' + elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': + llvm_type = 'Type::getInt64Ty(ctx)' + elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': + llvm_type = 'Type::getInt16Ty(ctx)' + elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': + llvm_type = 'Type::getInt32Ty(ctx)' + elif type == 'float' or type == 'FLOAT': + llvm_type = 'Type::getFloatTy(ctx)' + elif type == 'double' or type == 'DOUBLE': + llvm_type = 'Type::getDoubleTy(ctx)' + elif type == 'void' or type == 'VOID': + llvm_type = 'Type::getInt32Ty(ctx)' + elif type == 'HANDLE': + llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' + elif type == 'simdscalar': + llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' + elif type == 'simdscalari': + llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' + elif type == 'simdvector': + llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)' + else: + llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) + + if is_pointer: + llvm_type = 'PointerType::get(%s, 0)' % llvm_type + + if is_pointer_pointer: + llvm_type = 'PointerType::get(%s, 0)' % llvm_type + + if is_array_array: + llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) + elif is_array: + llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) + + 
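+    # e.g. a field declared 'float foo[2][4]' becomes
+    # ArrayType::get(ArrayType::get(Type::getFloatTy(ctx), 4), 2)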
return [' members.push_back( %s ); // %s' % (llvm_type, name)] + +""" +""" +def gen_llvm_types(input_file, output_file): + + output_lines = gen_file_header(os.path.basename(output_file.name)) + + lines = input_file.readlines() + + postfix_name = "" + + for idx in range(len(lines)): + line = lines[idx].rstrip() + + match = re.match(r"(\s*)struct(\s*)(\w+)", line) + if match: + llvm_args = [] + + # Detect start of structure + is_fwd_decl = re.search(r";", line) + + if not is_fwd_decl: + + # Extract the command name + struct_name = match.group(3).strip() + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + '/// Generate LLVM type information for %s' % struct_name, + 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), + '{', + ' LLVMContext& ctx = pJitMgr->mContext;', + ' std::vector<Type*> members;', + '', + ] + + end_of_struct = False + + while not end_of_struct and idx < len(lines)-1: + idx += 1 + line = lines[idx].rstrip() + + is_llvm_typedef = re.search(r"@llvm_typedef", line) + if is_llvm_typedef is not None: + is_llvm_typedef = True + else: + is_llvm_typedef = False + + ########################################### + # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. + is_llvm_struct = re.search(r"@llvm_struct", line) + + if is_llvm_struct is not None: + is_llvm_struct = True + else: + is_llvm_struct = False + + ########################################### + # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. + is_llvm_enum = re.search(r"@llvm_enum", line) + + if is_llvm_enum is not None: + is_llvm_enum = True + else: + is_llvm_enum = False + + ########################################### + # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. + is_llvm_pfn = re.search(r"@llvm_pfn", line) + + if is_llvm_pfn is not None: + is_llvm_pfn = True + else: + is_llvm_pfn = False + + ########################################### + # Is field const? + is_const = re.search(r"\s+const\s+", line) + + if is_const is not None: + is_const = True + else: + is_const = False + + ########################################### + # Is field a pointer? + is_pointer_pointer = re.search("\*\*", line) + + if is_pointer_pointer is not None: + is_pointer_pointer = True + else: + is_pointer_pointer = False + + ########################################### + # Is field a pointer? + is_pointer = re.search("\*", line) + + if is_pointer is not None: + is_pointer = True + else: + is_pointer = False + + ########################################### + # Is field an array of arrays? + # TODO: Can add this to a list. + is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line) + array_count = '0' + array_count1 = '0' + + if is_array_array is not None: + array_count = is_array_array.group(1) + array_count1 = is_array_array.group(2) + is_array_array = True + else: + is_array_array = False + + ########################################### + # Is field an array? 
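+                    # (this also matches the first bracket of a 2-D field; harmless,
+                    # since gen_llvm_type tests is_array_array before is_array)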
+ is_array = re.search("\[(\w*)\]", line) + + if is_array is not None: + array_count = is_array.group(1) + is_array = True + else: + is_array = False + + is_scoped = re.search("::", line) + + if is_scoped is not None: + is_scoped = True + else: + is_scoped = False + + type = None + name = None + if is_const and is_pointer: + + if is_scoped: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) + + type = "%s%s" % (field_match.group(4), field_match.group(5)) + name = field_match.group(7) + else: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) + + type = field_match.group(4) + name = field_match.group(6) + + elif is_pointer: + field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line) + + if field_match: + type = field_match.group(3) + name = field_match.group(5) + elif is_const: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line) + + if field_match: + type = field_match.group(4) + name = field_match.group(6) + else: + if is_scoped: + field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line) + + if field_match: + type = field_match.group(1) + '::' + field_match.group(2) + name = field_match.group(3) + else: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line) + + if field_match: + type = field_match.group(2) + name = field_match.group(4) + + if is_llvm_typedef is False: + if type is not None: + output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file) + llvm_args.append(name) + + # Detect end of structure + end_of_struct = re.match(r"(\s*)};", line) + + if (end_of_struct): + output_lines += [ + '', + ' return StructType::get(ctx, members, false);', + '}', + '', + ] + + for i in range(len(llvm_args)): + output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) + + output_lines.append('') + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Function which is invoked when this script is started from a command line. + Will present and consume a set of arguments which will tell this script how + to behave +""" +def main(): + + # Parse args... + parser = argparse.ArgumentParser() + parser.add_argument("--input", "-i", type=argparse.FileType('r'), + help="Path to input file containing structs", required=True) + parser.add_argument("--output", "-o", type=argparse.FileType('w'), + help="Path to output file", required=True) + parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False) + args = parser.parse_args() + + gen_llvm_types(args.input, args.output) + +if __name__ == '__main__': + main() +# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp new file mode 100644 index 00000000000..6c5f22bc47c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -0,0 +1,357 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.cpp
+*
+* @brief Implementation of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "streamout_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+
+#include <sstream>
+#include <unordered_set>
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to jitting a streamout shader
+//////////////////////////////////////////////////////////////////////////
+struct StreamOutJit : public Builder
+{
+    StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
+
+    // returns pointer to SWR_STREAMOUT_BUFFER
+    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
+    {
+        return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief checks if streamout buffer is oob
+    // @return <i1> true/false
+    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
+    {
+        Value* returnMask = C(false);
+
+        Value* pBuf = getSOBuffer(pSoCtx, buffer);
+
+        // load enable
+        // @todo bool data types should generate <i1> llvm type
+        Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
+
+        // load buffer size
+        Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
+
+        // load current streamOffset
+        Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+
+        // load buffer pitch
+        Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+
+        // buffer is considered oob if in use in a decl but not enabled
+        returnMask = OR(returnMask, NOT(enabled));
+
+        // buffer is oob if it cannot fit a prim's worth of verts
+        Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
+        returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
+
+        return returnMask;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
+    //        packing the active mask bits
+    // ex.
bitmask 0011 -> (0, 1, 0, 0) + // bitmask 1000 -> (3, 0, 0, 0) + // bitmask 1100 -> (2, 3, 0, 0) + Value* PackMask(uint32_t bitmask) + { + std::vector<Constant*> indices(4, C(0)); + DWORD index; + uint32_t elem = 0; + while (_BitScanForward(&index, bitmask)) + { + indices[elem++] = C((int)index); + bitmask &= ~(1 << index); + } + + return ConstantVector::get(indices); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief convert scalar bitmask to <4xfloat> bitmask + Value* ToMask(uint32_t bitmask) + { + std::vector<Constant*> indices; + for (uint32_t i = 0; i < 4; ++i) + { + if (bitmask & (1 << i)) + { + indices.push_back(C(-1.0f)); + } + else + { + indices.push_back(C(0.0f)); + } + } + return ConstantVector::get(indices); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief processes a single decl from the streamout stream. Reads 4 components from the input + // stream and writes N components to the output buffer given the componentMask or if + // a hole, just increments the buffer pointer + // @param pStream - pointer to current attribute + // @param pOutBuffers - pointers to the current location of each output buffer + // @param decl - input decl + void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) + { + // @todo add this to x86 macros + Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); + + uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); + uint32_t packedMask = (1 << numComponents) - 1; + if (!decl.hole) + { + // increment stream pointer to correct slot + Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); + + // load 4 components from stream + Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); + Type* simd4PtrTy = PointerType::get(simd4Ty, 0); + pAttrib = BITCAST(pAttrib, simd4PtrTy); + Value *vattrib = LOAD(pAttrib); + + // shuffle/pack enabled components + Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); + + // store to output buffer + // cast SO buffer to i8*, needed by maskstore + Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); + + // cast input to <4xfloat> + Value* src = BITCAST(vpackedAttrib, simd4Ty); + CALL(maskStore, {pOut, ToMask(packedMask), src}); + } + + // increment SO buffer + pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief builds a single vertex worth of data for the given stream + // @param streamState - state for this stream + // @param pCurVertex - pointer to src stream vertex data + // @param pOutBuffer - pointers to up to 4 SO buffers + void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) + { + for (uint32_t d = 0; d < streamState.numDecls; ++d) + { + const STREAMOUT_DECL& decl = streamState.decl[d]; + buildDecl(pCurVertex, pOutBuffer, decl); + } + } + + void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) + { + // get list of active SO buffers + std::unordered_set<uint32_t> activeSOBuffers; + for (uint32_t d = 0; d < streamState.numDecls; ++d) + { + const STREAMOUT_DECL& decl = streamState.decl[d]; + activeSOBuffers.insert(decl.bufferIndex); + } + + // always increment numPrimStorageNeeded + Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, 
SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); + numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); + STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); + + // check OOB on active SO buffers. If any buffer is out of bound, don't write + // the primitive to any buffer + Value* oobMask = C(false); + for (uint32_t buffer : activeSOBuffers) + { + oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); + } + + BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); + + // early out if OOB + COND_BR(oobMask, returnBB, validBB); + + IRB()->SetInsertPoint(validBB); + + Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); + numPrimsWritten = ADD(numPrimsWritten, C(1)); + STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); + + // compute start pointer for each output buffer + Value* pOutBuffer[4]; + Value* pOutBufferStartVertex[4]; + Value* outBufferPitch[4]; + for (uint32_t b: activeSOBuffers) + { + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); + Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + pOutBuffer[b] = GEP(pData, streamOffset); + pOutBufferStartVertex[b] = pOutBuffer[b]; + + outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); + } + + // loop over the vertices of the prim + Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); + for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) + { + buildVertex(streamState, pStreamData, pOutBuffer); + + // increment stream and output buffer pointers + // stream verts are always 32*4 dwords apart + pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4)); + + // output buffers offset using pitch in buffer state + for (uint32_t b : activeSOBuffers) + { + pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); + pOutBuffer[b] = pOutBufferStartVertex[b]; + } + } + + // update each active buffer's streamOffset + for (uint32_t b : activeSOBuffers) + { + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); + STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + } + } + + Function* Create(const STREAMOUT_COMPILE_STATE& state) + { + static std::size_t soNum = 0; + + std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << soNum++; + + // SO function signature + // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*) + + std::vector<Type*> args{ + PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* + }; + + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + + // create return basic block + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); + BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); + + IRB()->SetInsertPoint(entry); + + // arguments + auto argitr = soFunc->getArgumentList().begin(); + Value* pSoCtx = &*argitr++; + pSoCtx->setName("pSoCtx"); + + const STREAMOUT_STREAM& streamState = state.stream; + buildStream(state, streamState, pSoCtx, returnBB, soFunc); + + BR(returnBB); + + IRB()->SetInsertPoint(returnBB); + RET_VOID(); + + JitManager::DumpToFile(soFunc, "SoFunc"); + + 
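+        // Optimize the generated streamout function: promote stack slots to
+        // SSA values, then let the CSE/instcombine/SCCP/DCE passes below fold
+        // the redundant context loads and pointer arithmetic emitted by the
+        // LOAD/STORE/GEP helpers above.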
FunctionPassManager passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); + passes.add(createCFGSimplificationPass()); + passes.add(createEarlyCSEPass()); + passes.add(createPromoteMemoryToRegisterPass()); + passes.add(createCFGSimplificationPass()); + passes.add(createEarlyCSEPass()); + passes.add(createInstructionCombiningPass()); + passes.add(createInstructionSimplifierPass()); + passes.add(createConstantPropagationPass()); + passes.add(createSCCPPass()); + passes.add(createAggressiveDCEPass()); + + passes.run(*soFunc); + + JitManager::DumpToFile(soFunc, "SoFunc_optimized"); + + return soFunc; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief JITs from streamout shader IR +/// @param hJitMgr - JitManager handle +/// @param func - LLVM function IR +/// @return PFN_SO_FUNC - pointer to SOS function +PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function *func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_SO_FUNC pfnStreamOut; + pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + + return pfnStreamOut; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles streamout shader +/// @param hJitMgr - JitManager handle +/// @param state - SO state to build function from +extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + STREAMOUT_COMPILE_STATE soState = state; + if (soState.offsetAttribs) + { + for (uint32_t i = 0; i < soState.stream.numDecls; ++i) + { + soState.stream.decl[i].attribSlot -= soState.offsetAttribs; + } + } + + pJitMgr->SetupNewModule(); + + StreamOutJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(soState); + + return JitStreamoutFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h new file mode 100644 index 00000000000..097f8ab44d9 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h @@ -0,0 +1,94 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.h
+*
+* @brief Definition of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_DECL - Stream decl
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_DECL
+{
+    // Buffer that stream maps to.
+    DWORD bufferIndex;
+
+    // attribute to stream
+    uint32_t attribSlot;
+
+    // attribute component mask
+    uint32_t componentMask;
+
+    // indicates this decl is a hole
+    bool hole;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_STREAM - Stream decls
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_STREAM
+{
+    // number of decls for this stream
+    uint32_t numDecls;
+
+    // array of numDecls decls
+    STREAMOUT_DECL decl[128];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for streamout jit
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_COMPILE_STATE
+{
+    // number of verts per primitive
+    uint32_t numVertsPerPrim;
+    uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
+
+    uint64_t streamMask;
+
+    // stream decls
+    STREAMOUT_STREAM stream;
+
+    bool operator==(const STREAMOUT_COMPILE_STATE &other) const
+    {
+        if (numVertsPerPrim != other.numVertsPerPrim) return false;
+        if (stream.numDecls != other.stream.numDecls) return false;
+
+        for (uint32_t i = 0; i < stream.numDecls; ++i)
+        {
+            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
+            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
+            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
+            if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
new file mode 100644
index 00000000000..ad73cd840a7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -0,0 +1,287 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ClearTile.cpp +* +* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro +* tile in the destination. +* +******************************************************************************/ +#include "common/os.h" +#include "core/context.h" +#include "common/formats.h" +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" + +typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); + +////////////////////////////////////////////////////////////////////////// +/// Clear Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS]; + +static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// StoreRasterTileClear +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreRasterTileClear +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pColor - Pointer to clear color. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void StoreClear( + const BYTE* dstFormattedColor, + UINT dstBytesPerPixel, + SWR_SURFACE_STATE* pDstSurface, + UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. + { + // Compute destination address for raster tile. + BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress + + (y * pDstSurface->pitch) + (x * dstBytesPerPixel); + + // start of first row + BYTE* pDst = pDstTile; + UINT dstBytesPerRow = 0; + + // For each raster tile pixel in row 0 (rx, 0) + for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx) + { + memcpy(pDst, dstFormattedColor, dstBytesPerPixel); + + // Increment pointer to next pixel in row. + pDst += dstBytesPerPixel; + dstBytesPerRow += dstBytesPerPixel; + } + + // start of second row + pDst = pDstTile + pDstSurface->pitch; + + // For each remaining row in the rest of the raster tile + for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry) + { + // copy row + memcpy(pDst, pDstTile, dstBytesPerRow); + + // Increment pointer to first pixel in next row. + pDst += pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles. +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreMacroTileClear +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface. + /// @param pColor - Pointer to color to write to pixels. 
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to macro tile
+    static void StoreClear(
+        const FLOAT *pColor,
+        SWR_SURFACE_STATE* pDstSurface,
+        UINT x, UINT y)
+    {
+        UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
+
+        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+
+        FLOAT srcColor[4];
+
+        for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
+        {
+            srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
+        }
+
+        // using this helper function, but the tiling traits are unused inside it, so a dummy value suffices
+        ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
+
+        // Store each raster tile from the hot tile to the destination surface.
+        // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
+        //       Intent is for this function to only handle full tiles.
+        for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+        {
+            for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+            {
+                StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row));
+            }
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Writes clear color to every pixel of a render surface
+/// @param hPrivateContext - Handle to private DC
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param pClearColor - Pointer to clear color
+void StoreHotTileClear(
+    SWR_SURFACE_STATE *pDstSurface,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    UINT x,
+    UINT y,
+    const float* pClearColor)
+{
+    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;
+
+    SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet.
+
+    if (renderTargetIndex != SWR_ATTACHMENT_DEPTH)
+    {
+        pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
+    }
+    else
+    {
+        pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
+    }
+
+    SWR_ASSERT(pfnStoreTilesClear != NULL);
+
+    // Store a macro tile.
+    /// @todo Once all formats are supported, the if check can go away. This is to help us near term to make progress.
+    if (pfnStoreTilesClear != NULL)
+    {
+        pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_COLOR_TABLE - Helper macro for setting up the tables.
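+/// The hot tile holds clear colors as four floats, so every entry below
+/// converts from R32G32B32A32_FLOAT to the bound destination surface format.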
+#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \ + memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \ + \ + sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \ + 
sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
R16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
B10G10R10A2_UINT>::StoreClear; \
+    sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
+    sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
+    sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_DEPTH_TABLE - Helper macro for setting up the tables.
+#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
+    memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
+    \
+    sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
+    sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for ClearTile
+void InitSimClearTilesTable()
+{
+    INIT_STORE_TILES_CLEAR_COLOR_TABLE();
+    INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
new file mode 100644
index 00000000000..0f9e0ad4bd8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -0,0 +1,698 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file Convert.h
+*
+* @brief Conversion utility functions
+*
+******************************************************************************/
+#pragma once
+
+#if defined(_WIN32)
+// disable "potential divide by 0"
+#pragma warning(disable: 4723)
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+///        float
+/// @param val - 16-bit float
+/// @todo Maybe move this outside of this file into a header?
+static float ConvertSmallFloatTo32(UINT val)
+{
+    UINT result;
+    if ((val & 0x7fff) == 0)
+    {
+        result = ((uint32_t)(val & 0x8000)) << 16;
+    }
+    else if ((val & 0x7c00) == 0x7c00)
+    {
+        result = ((val & 0x3ff) == 0) ?
0x7f800000 : 0x7fc00000; + result |= ((uint32_t)val & 0x8000) << 16; + } + else + { + uint32_t sign = (val & 0x8000) << 16; + uint32_t mant = (val & 0x3ff) << 13; + uint32_t exp = (val >> 10) & 0x1f; + if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals + { + mant <<= 1; + while (mant < (0x400 << 13)) + { + exp--; + mant <<= 1; + } + mant &= (0x3ff << 13); + } + exp = ((exp - 15 + 127) & 0xff) << 23; + result = sign | exp | mant; + } + + return *(float*)&result; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert an IEEE 754 32-bit single precision float to an +/// unsigned small float with 5 exponent bits and a variable +/// number of mantissa bits. +/// @param val - 32-bit float +/// @todo Maybe move this outside of this file into a header? +template<UINT numMantissaBits> +static UINT Convert32ToSmallFloat(float val) +{ + uint32_t sign, exp, mant; + uint32_t roundBits; + + // Extract the sign, exponent, and mantissa + UINT uf = *(UINT*)&val; + + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; + + // 10/11 bit floats are unsigned. Negative values are clamped to 0. + if (sign != 0) + { + exp = mant = 0; + } + // Check for out of range + else if ((exp == 0xFF) && (mant != 0)) // NaN + { + exp = 0x1F; + mant = 1 << numMantissaBits; + } + else if ((exp == 0xFF) && (mant == 0)) // INF + { + exp = 0x1F; + mant = 0; + } + else if (exp > (0x70 + 0x1E)) // Too big to represent + { + exp = 0x1Eu; + mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa. + } + else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + { + mant |= 0x00800000; + for (; exp <= 0x70; mant >>= 1, exp++) + ; + exp = 0; + mant = mant >> (23 - numMantissaBits); + } + else if (exp < 0x66) // Too small to represent -> Zero + { + exp = 0; + mant = 0; + } + else + { + // Saves bits that will be shifted off for rounding + roundBits = mant & 0x1FFFu; + // convert exponent and mantissa to 16 bit format + exp = exp - 0x70u; + mant = mant >> (23 - numMantissaBits); + + // Essentially RTZ, but round up if off by only 1 lsb + if (roundBits == 0x1FFFu) + { + mant++; + // check for overflow + if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits) + exp++; + // make sure only the needed bits are used + mant &= (1 << numMantissaBits) - 1; + } + } + + UINT tmpVal = (exp << numMantissaBits) | mant; + return tmpVal; +} + +#if KNOB_ARCH == KNOB_ARCH_AVX +////////////////////////////////////////////////////////////////////////// +/// @brief Convert an IEEE 754 32-bit single precision float to a +/// 16-bit half-precision float with 5 exponent bits and +/// 10 mantissa bits. +/// @param val - 32-bit float +/// @todo Maybe move this outside of this file into a header?
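An illustrative sketch (not part of the patch) of the core of Convert32ToSmallFloat above for the easy case: a normal-range, non-negative value is encoded by rebiasing the IEEE exponent from 127 to 15 and truncating the 23-bit mantissa down to numMantissaBits. The full function additionally handles sign clamping, NaN/INF, denorms, and rounding.

#include <cassert>
#include <cstdint>
#include <cstring>

// Editorial sketch: normal-range, non-negative inputs only, truncating.
template <unsigned N> // N = mantissa bits (6 for the 11-bit, 5 for the 10-bit component)
static uint32_t EncodeSmallFloat(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));   // well-defined type pun
    uint32_t exp  = (bits >> 23) & 0xFF;    // IEEE-754 exponent, bias 127
    uint32_t mant = bits & 0x007FFFFF;      // 23-bit mantissa
    return ((exp - 127 + 15) << N) | (mant >> (23 - N)); // rebias, truncate
}

int main()
{
    assert(EncodeSmallFloat<6>(1.0f) == 0x3C0); // 11-bit component of R11G11B10_FLOAT
    assert(EncodeSmallFloat<5>(1.0f) == 0x1E0); // 10-bit component
    return 0;
}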
+static uint16_t Convert32To16Float(float val) +{ + uint32_t sign, exp, mant; + uint32_t roundBits; + + // Extract the sign, exponent, and mantissa + uint32_t uf = *(uint32_t*)&val; + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; + + // Check for out of range + if (std::isnan(val)) + { + exp = 0x1F; + mant = 0x200; + sign = 1; // set the sign bit for NaNs + } + else if (std::isinf(val)) + { + exp = 0x1f; + mant = 0x0; + } + else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value + { + exp = 0x1E; + mant = 0x3FF; + } + else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + { + mant |= 0x00800000; + for (; exp <= 0x70; mant >>= 1, exp++) + ; + exp = 0; + mant = mant >> 13; + } + else if (exp < 0x66) // Too small to represent -> Zero + { + exp = 0; + mant = 0; + } + else + { + // Saves bits that will be shifted off for rounding + roundBits = mant & 0x1FFFu; + // convert exponent and mantissa to 16 bit format + exp = exp - 0x70; + mant = mant >> 13; + + // Essentially RTZ, but round up if off by only 1 lsb + if (roundBits == 0x1FFFu) + { + mant++; + // check for overflow + if ((mant & 0xC00u) != 0) + exp++; + // make sure only the needed bits are used + mant &= 0x3FF; + } + } + + uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; + return (uint16_t)tmpVal; +} +#endif + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert a pixel from the float hot tile format to the destination format. +/// @param pDstPixel - Pointer to destination pixel. +/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). +template<SWR_FORMAT DstFormat> +static void ConvertPixelFromFloat( + BYTE* pDstPixel, + const float srcPixel[4]) +{ + UINT outColor[4]; // typeless bits + + // Store component + for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) + { + SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp); + + float src = srcPixel[comp]; + + switch (type) + { + case SWR_TYPE_UNORM: + { + // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. + src = (src != src) ? 0.0f : src; + + // Clamp [0, 1] + src = std::max(src, 0.0f); + src = std::min(src, 1.0f); + + // SRGB + if (FormatTraits<DstFormat>::isSRGB && comp != 3) + { + src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f); + } + + // Float scale to integer scale. + UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; + src = (float)scale * src; + src = roundf(src); + outColor[comp] = (UINT)src; // Drop fractional part. + break; + } + case SWR_TYPE_SNORM: + { + SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB); + + // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. + src = (src != src) ? 0.0f : src; + + // Clamp [-1, 1] + src = std::max(src, -1.0f); + src = std::min(src, 1.0f); + + // Float scale to integer scale. + UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; + src = (float)scale * src; + + // Round + src += (src >= 0) ? 0.5f : -0.5f; + + INT out = (INT)src; + + outColor[comp] = *(UINT*)&out; + + break; + } + case SWR_TYPE_UINT: + { + ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float. + // However, the number in the hot tile should be unsigned integer. So doing this + // to preserve bits instead of doing a float -> integer conversion.
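A scalar sketch (editorial illustration, not part of the patch) of the UNORM path above, reduced to a standalone helper: flush NaN to zero, clamp to [0, 1], scale by 2^bpc - 1, and round. The sRGB branch is omitted here.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Editorial sketch of the UNORM quantization (no sRGB, bpc <= 23).
static uint32_t QuantizeUnorm(float src, uint32_t bpc)
{
    src = (src != src) ? 0.0f : src;              // NaN compares false to itself -> force 0
    src = std::min(std::max(src, 0.0f), 1.0f);    // clamp [0, 1]
    const float scale = (float)((1u << bpc) - 1); // e.g. 255 for 8-bit
    return (uint32_t)std::roundf(src * scale);
}

int main()
{
    assert(QuantizeUnorm(0.5f, 8) == 128);    // roundf(127.5) rounds away from zero
    assert(QuantizeUnorm(-2.0f, 8) == 0);     // clamped
    assert(QuantizeUnorm(1.0f, 16) == 65535);
    return 0;
}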
+ if (FormatTraits<DstFormat>::GetBPC(comp) == 32) + { + outColor[comp] = *(UINT*)&src; + } + else + { + outColor[comp] = *(UINT*)&src; + UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; // 2^numBits - 1 + + outColor[comp] = std::min(max, outColor[comp]); + } + break; + } + case SWR_TYPE_SINT: + { + if (FormatTraits<DstFormat>::GetBPC(comp) == 32) + { + outColor[comp] = *(UINT*)&src; + } + else + { + INT out = *(INT*)&src; // Hot tile format is SINT? + INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; + INT min = -1 - max; + + ///@note The output is unsigned integer (bag of bits) and so performing + // the clamping here based on range of output component. Also, manually adding + // the sign bit in the appropriate spot. Maybe a better way? + out = std::max(out, min); + out = std::min(out, max); + + outColor[comp] = *(UINT*)&out; + } + break; + } + case SWR_TYPE_FLOAT: + { + if (FormatTraits<DstFormat>::GetBPC(comp) == 16) + { + // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph + // @todo 16bit float instruction support is orthogonal to avx support. need to + // add check for F16C support instead. +#if KNOB_ARCH == KNOB_ARCH_AVX2 + __m128 src128 = _mm_set1_ps(src); + __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC); + UINT value = _mm_extract_epi16(srci128, 0); +#else + UINT value = Convert32To16Float(src); +#endif + + outColor[comp] = value; + } + else if (FormatTraits<DstFormat>::GetBPC(comp) == 11) + { + outColor[comp] = Convert32ToSmallFloat<6>(src); + } + else if (FormatTraits<DstFormat>::GetBPC(comp) == 10) + { + outColor[comp] = Convert32ToSmallFloat<5>(src); + } + else + { + outColor[comp] = *(UINT*)&src; + } + + break; + } + default: + SWR_ASSERT(0); + break; + } + } + + typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel; + + switch (FormatTraits<DstFormat>::numComps) + { + case 4: + pPixel->a = outColor[3]; + case 3: + pPixel->b = outColor[2]; + case 2: + pPixel->g = outColor[1]; + case 1: + pPixel->r = outColor[0]; + break; + default: + SWR_ASSERT(0); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert pixel in any format to float32 +/// @param pDstPixel - Pointer to destination pixel. 
+/// @param srcPixel - Pointer to source pixel +template<SWR_FORMAT SrcFormat> +INLINE static void ConvertPixelToFloat( + float dstPixel[4], + const BYTE* pSrc) +{ + UINT srcColor[4]; // typeless bits + + // unpack src pixel + typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc; + + // apply format defaults + for (uint32_t comp = 0; comp < 4; ++comp) + { + uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp); + dstPixel[comp] = *(float*)&def; + } + + // load format data + switch (FormatTraits<SrcFormat>::numComps) + { + case 4: + srcColor[3] = pPixel->a; + case 3: + srcColor[2] = pPixel->b; + case 2: + srcColor[1] = pPixel->g; + case 1: + srcColor[0] = pPixel->r; + break; + default: + SWR_ASSERT(0); + } + + // Convert components + for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp) + { + SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp); + + UINT src = srcColor[comp]; + + switch (type) + { + case SWR_TYPE_UNORM: + { + float dst; + if (FormatTraits<SrcFormat>::isSRGB && comp != 3) + { + dst = *(float*)&srgb8Table[src]; + } + else + { + // component sizes > 16 must use fp divide to maintain ulp requirements + if (FormatTraits<SrcFormat>::GetBPC(comp) > 16) + { + dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1); + } + else + { + const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1)); + dst = (float)src * scale; + } + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; + break; + } + case SWR_TYPE_SNORM: + { + SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB); + + float dst; + if (src == 0x10) + { + dst = -1.0f; + } + else + { + switch (FormatTraits<SrcFormat>::GetBPC(comp)) + { + case 8: + dst = (float)((int8_t)src); + break; + case 16: + dst = (float)((int16_t)src); + break; + case 32: + dst = (float)((int32_t)src); + break; + default: + assert(0 && "attempted to load from SNORM with unsupported bpc"); + dst = 0.0f; + break; + } + dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1)); + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; + break; + } + case SWR_TYPE_UINT: + { + UINT dst = (UINT)src; + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + case SWR_TYPE_SINT: + { + int dst; + switch (FormatTraits<SrcFormat>::GetBPC(comp)) + { + case 8: + dst = (int8_t)src; + break; + case 16: + dst = (int16_t)src; + break; + case 32: + dst = (int32_t)src; + break; + default: + assert(0 && "attempted to load from SINT with unsupported bpc"); + dst = 0; + break; + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + case SWR_TYPE_FLOAT: + { + float dst; + if (FormatTraits<SrcFormat>::GetBPC(comp) == 16) + { +#if KNOB_ARCH == KNOB_ARCH_AVX2 + // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps + // @todo 16bit float instruction support is orthogonal to avx support. need to + // add check for F16C support instead. 
+ __m128i src128 = _mm_set1_epi32(src); + __m128 res = _mm_cvtph_ps(src128); + _mm_store_ss(&dst, res); +#else + dst = ConvertSmallFloatTo32(src); +#endif + } + else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11) + { + dst = ConvertSmallFloatTo32(src << 4); + } + else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10) + { + dst = ConvertSmallFloatTo32(src << 5); + } + else + { + dst = *(float*)&src; + } + + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + default: + SWR_ASSERT(0); + break; + } + } +} + +// non-templated version of conversion functions +INLINE static void ConvertPixelFromFloat( + SWR_FORMAT format, + uint8_t* pDst, + const float srcPixel[4]) +{ + switch (format) + { + case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break; + case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break; + case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break; + case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break; + case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break; + case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break; + case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break; + case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break; + case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break; + case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break; + case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break; + case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break; + case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break; + case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break; + case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break; + case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break; + case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break; + case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break; + case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break; + case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break; + case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break; + case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break; + case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break; + case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break; + case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break; + case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break; + case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS_LD>(pDst, srcPixel); break; + case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break; + case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break; + case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break; + case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); 
break; + case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break; + case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break; + case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break; + case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break; + case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break; + case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break; + case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break; + case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break; + case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break; + case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break; + case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break; + case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break; + case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break; + case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break; + case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break; + case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break; + case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break; + case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS_LD>(pDst, srcPixel); break; + case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break; + case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break; + case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break; + case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break; + case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break; + case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break; + case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break; + case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break; + case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break; + case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break; + case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break; + case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break; + case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break; + case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break; + case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break; + case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break; + case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break; + case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break; + case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break; + case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break; + case 
R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break; + case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break; + case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break; + case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break; + case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break; + case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break; + case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break; + case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break; + case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break; + case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break; + case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break; + case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break; + case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break; + case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break; + case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break; + case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break; + case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break; + case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break; + case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break; + case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break; + case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break; + case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break; + case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break; + case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break; + case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break; + case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break; + case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break; + case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break; + case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break; + case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break; + case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break; + case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break; + case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break; + case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break; + case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break; + case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break; + case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break; + case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break; + case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break; + case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break; + case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break; + case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break; + case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break; + case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); 
break; + case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break; + case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break; + case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break; + case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break; + case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break; + case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break; + case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break; + case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break; + case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break; + case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break; + case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break; + case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break; + case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break; + case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break; + default: + break; + } +} + + diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp new file mode 100644 index 00000000000..5d9c0045a8a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp @@ -0,0 +1,396 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.cpp +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" + +typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Load Raster Tile Function Tables. 
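Before the LoadTile implementation, a brief usage sketch of the non-templated dispatch wrapper Convert.h just defined (editorial illustration; it assumes Convert.h and the SWR format headers are included, and makes no claim about the exact byte layout produced):

float   hotTilePixel[4] = { 1.0f, 0.5f, 0.25f, 1.0f }; // RGBA floats from the hot tile
uint8_t packed[4];                                     // room for one 32bpp texel
ConvertPixelFromFloat(B8G8R8A8_UNORM, packed, hotTilePixel);
// 'packed' now holds the four 8-bit UNORM channels in the format's memory layout.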
+////////////////////////////////////////////////////////////////////////// +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; +static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; + +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; + +static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// LoadRasterTile +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct LoadRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a converted color into the destination hot tile, which is always float. + /// @param srcColor - Source color to store. + /// @param x, y - Coordinates within the raster tile. + /// @param pDst - Pointer to destination hot tile. + INLINE static void SetSwizzledDstColor( + const float srcColor[4], + uint32_t x, uint32_t y, + uint8_t* pDst) + { + typedef SimdTile<DstFormat, SrcFormat> SimdT; + + SimdT* pDstSimdTiles = (SimdT*)pDst; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->SetSwizzledColor(simdOffset, srcColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Loads an 8x8 raster tile from the src surface. + /// @param pSrcSurface - Src surface state + /// @param pDst - Destination hot tile pointer + /// @param x, y - Coordinates to raster tile. + INLINE static void Load( + SWR_SURFACE_STATE* pSrcSurface, + uint8_t* pDst, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. + { + uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; + uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, + pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, + pSrcSurface->lod, pSrcSurface); + + float srcColor[4]; + ConvertPixelToFloat<SrcFormat>(srcColor, pSrc); + + // store pixel to hottile + SetSwizzledDstColor(srcColor, rx, ry, pDst); + } + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// LoadMacroTile - Loads a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct LoadMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Load a macro tile from the source surface into the hot tile. + /// @param pSrcSurface - Source surface state
+ /// @param pDstHotTile - Pointer to destination hot tile + /// @param x, y - Coordinates to macro tile + static void Load( + SWR_SURFACE_STATE* pSrcSurface, + uint8_t *pDstHotTile, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + // Load each raster tile from the source surface into the hot tile. + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) + { + LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load(pSrcSurface, pDstHotTile, + (x + col), (y + row), sampleNum, renderTargetArrayIndex); + pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8); + } + } + } + } +}; + +static void BUCKETS_START(UINT id) +{ +#ifdef KNOB_ENABLE_RDTSC + gBucketMgr.StartBucket(id); +#endif +} + +static void BUCKETS_STOP(UINT id) +{ +#ifdef KNOB_ENABLE_RDTSC + gBucketMgr.StopBucket(id); +#endif +} + +// on demand buckets for load tiles +static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1); +static std::mutex sBucketMutex; + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads a full hottile from a render surface +/// @param pSrcSurface - Source surface state +/// @param dstFormat - Format for hot tile. +/// @param renderTargetIndex - Index to src render target +/// @param x, y - Coordinates to macro tile. +/// @param renderTargetArrayIndex - Render target array slice +/// @param pDstHotTile - Pointer to Hot Tile +void LoadHotTile( + SWR_SURFACE_STATE *pSrcSurface, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, + uint8_t *pDstHotTile) +{ + PFN_LOAD_TILES pfnLoadTiles = NULL; + + // don't need to load null surfaces + if (pSrcSurface->type == SURFACE_NULL) + { + return; + } + + // force 0 if requested renderTargetArrayIndex is OOB + if (renderTargetArrayIndex >= pSrcSurface->depth) + { + renderTargetArrayIndex = 0; + } + + if (renderTargetIndex < SWR_ATTACHMENT_DEPTH) + { + switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format]; + break; + case SWR_TILE_MODE_YMAJOR: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; + break; + case SWR_TILE_MODE_XMAJOR: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format]; + break; + case SWR_TILE_MODE_WMAJOR: + SWR_ASSERT(pSrcSurface->format == R8_UINT); + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) + { + // Currently depth can map to linear and tile-y.
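As a worked example of the simd-tile indexing used by SetSwizzledDstColor above (editorial sketch; the knob values KNOB_TILE_X_DIM = 8, SIMD_TILE_X_DIM = 4, SIMD_TILE_Y_DIM = 2 are assumed, matching an 8-wide AVX simd), pixel (5, 3) of an 8x8 raster tile lands in the fourth 4x2 simd tile, lane 5:

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t KNOB_TILE_X_DIM = 8, SIMD_TILE_X_DIM = 4, SIMD_TILE_Y_DIM = 2;
    uint32_t x = 5, y = 3; // pixel coordinates inside the 8x8 raster tile
    uint32_t simdIndex  = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM)
                        + (x / SIMD_TILE_X_DIM);
    uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM
                        + (x % SIMD_TILE_X_DIM);
    assert(simdIndex == 3 && simdOffset == 5); // simd tile #3 (0-based), lane 5
    return 0;
}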
+ switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format]; + break; + case SWR_TILE_MODE_YMAJOR: + pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + else + { + SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL); + SWR_ASSERT(pSrcSurface->format == R8_UINT); + switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load; + break; + case SWR_TILE_MODE_WMAJOR: + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + + if (pfnLoadTiles == nullptr) + { + SWR_ASSERT(false, "Unsupported format for load tile"); + return; + } + + // Load a macro tile. +#ifdef KNOB_ENABLE_RDTSC + if (sBuckets[pSrcSurface->format] == -1) + { + // guard sBuckets update since load tiles is called by multiple threads + sBucketMutex.lock(); + if (sBuckets[pSrcSurface->format] == -1) + { + const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format); + BUCKET_DESC desc{ info.name, "", false, 0xffffffff }; + sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc); + } + sBucketMutex.unlock(); + } +#endif + + BUCKETS_START(sBuckets[pSrcSurface->format]); + pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex); + BUCKETS_STOP(sBuckets[pSrcSurface->format]); +} + +////////////////////////////////////////////////////////////////////////// +/// INIT_LOAD_TILES_COLOR_TABLE - Helper macro for setting up the tables. +#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \ + memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \ + \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>,
R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; \ + 
sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + 
sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = 
LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \ + +////////////////////////////////////////////////////////////////////////// +/// INIT_LOAD_TILES_DEPTH_TABLE - Helper macro for setting up the tables. +#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \ + memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \ + \ + sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32_FLOAT>::Load; \ + sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32_FLOAT>::Load; \ + sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<tilemode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \ + +////////////////////////////////////////////////////////////////////////// +/// @brief Sets up tables for LoadTile +void InitSimLoadTilesTable() +{ + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); + INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); + + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR); + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR); + + INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp new file mode 100644 index 00000000000..9ed1d0bd0ec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp @@ -0,0 +1,1717 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile.cpp +* +* @brief Functionality for Store.
+* +******************************************************************************/ +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "core/format_conversion.h" + +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" +#include "core/multisample.h" + +#include <array> +#include <sstream> + +typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Store Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <size_t PixelSize, size_t NumDests> +struct StorePixels +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (8-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<8, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 4 bytes. + const uint16_t* pPixSrc = (const uint16_t*)pSrc; + + // Unswizzle from SWR-Z order + uint16_t* pRow = (uint16_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint16_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (16-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<16, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 8 bytes.
+ const uint32_t* pPixSrc = (const uint32_t*)pSrc; + + // Unswizzle from SWR-Z order + uint32_t* pRow = (uint32_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint32_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<32, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 16-bytes + __m128i *pZRow01 = (__m128i*)pSrc; + __m128i vQuad00 = _mm_load_si128(pZRow01); + __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); + + __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); + __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); + + _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); + _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (64-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<64, 4> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) + { + // Each 4-pixel row is 32 bytes. + const __m128i* pPixSrc = (const __m128i*)pSrc; + + // order of pointers matches SWR-Z layout + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[1]; + *pvDsts[2] = pPixSrc[2]; + *pvDsts[3] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (128-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<128, 8> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) + { + // Each 4-pixel row is 64 bytes.
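What the _mm_unpacklo/unpackhi_epi64 pair in the 32-bit specialization above accomplishes, emulated with scalars (editorial sketch, not part of the patch): SWR-Z order packs a 4x2 block as two 2x2 quads, so the low 64-bit halves of both quads form the top row and the high halves form the bottom row.

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // SWR-Z pixel order: two 2x2 quads
    uint32_t row0[4], row1[4];
    // unpacklo_epi64: low 64-bit half of each quad -> top row of the 4x2
    row0[0] = src[0]; row0[1] = src[1]; row0[2] = src[4]; row0[3] = src[5];
    // unpackhi_epi64: high 64-bit half of each quad -> bottom row
    row1[0] = src[2]; row1[1] = src[3]; row1[2] = src[6]; row1[3] = src[7];
    assert(row0[2] == 4 && row1[0] == 2); // quads interleave across the rows
    return 0;
}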
+ const __m128i* pPixSrc = (const __m128i*)pSrc; + + // Unswizzle from SWR-Z order + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[2]; + *pvDsts[2] = pPixSrc[1]; + *pvDsts[3] = pPixSrc[3]; + *pvDsts[4] = pPixSrc[4]; + *pvDsts[5] = pPixSrc[6]; + *pvDsts[6] = pPixSrc[5]; + *pvDsts[7] = pPixSrc[7]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct ConvertPixelsSOAtoAOS +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SrcFormat --> DstFormat + simdvector src; + LoadSOA<SrcFormat>(pSrc, src); + StoreSOA<DstFormat>(src, soaTile); + + // Convert from SOA --> AOS + FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); + + // Store data into destination + StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +/// Specialization for no format conversion +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT Format> +struct ConvertPixelsSOAtoAOS<Format, Format> +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SOA --> AOS + FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile); + + // Store data into destination + StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM +////////////////////////////////////////////////////////////////////////// +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
+ static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Load hot-tile
+ simdvector src, dst;
+ LoadSOA<SrcFormat>(pSrc, src);
+
+ // deswizzle
+ dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
+ dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
+ dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
+
+ // clamp
+ dst.x = Clamp<DstFormat>(dst.x, 0);
+ dst.y = Clamp<DstFormat>(dst.y, 1);
+ dst.z = Clamp<DstFormat>(dst.z, 2);
+
+ // normalize
+ dst.x = Normalize<DstFormat>(dst.x, 0);
+ dst.y = Normalize<DstFormat>(dst.y, 1);
+ dst.z = Normalize<DstFormat>(dst.z, 2);
+
+ // pack
+ simdscalari packed = _simd_castps_si(dst.x);
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
+ FormatTraits<DstFormat>::GetBPC(1)));
+
+ // pack low 16 bits of each 32-bit lane into the low 128 bits of the dst tile
+ uint32_t *pPacked = (uint32_t*)&packed;
+ uint16_t *pAosTile = (uint16_t*)&aosTile[0];
+ for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
+ {
+ *pAosTile++ = *pPacked++;
+ }
+
+ // Store data into destination
+ StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Specialized conversion for R24_UNORM_X8_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
+{
+ static const SWR_FORMAT SrcFormat = R32_FLOAT;
+ static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Converts a SIMD from the Hot Tile to the destination format
+ /// and converts from SOA to AOS.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param ppDsts - Array of destination pointers (rows of the destination
+ /// surface or a deswizzling buffer).
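+ /// @note Only the low 24 bits of each destination pixel are rewritten; the
+ /// X8 bits are preserved by the read-modify-write against a
+ /// 0x00FFFFFF mask in the body below.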
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Convert from SrcFormat --> DstFormat
+ simdvector src;
+ LoadSOA<SrcFormat>(pSrc, src);
+ StoreSOA<DstFormat>(src, soaTile);
+
+ // Convert from SOA --> AOS
+ FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
+
+ // Store data into destination but don't overwrite the X8 bits
+ // Each 4-pixel row is 16 bytes
+ __m128i *pZRow01 = (__m128i*)aosTile;
+ __m128i vQuad00 = _mm_load_si128(pZRow01);
+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+
+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+
+ __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
+ __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
+
+ __m128i vMask = _mm_set1_epi32(0xFFFFFF);
+
+ vDst0 = _mm_andnot_si128(vMask, vDst0);
+ vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
+ vDst1 = _mm_andnot_si128(vMask, vDst1);
+ vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+
+ _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
+ _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+ }
+};
+
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar);
+
+ // swizzle rgba -> bgra while we load
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
+ simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
+
+ // clamp
+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
+
+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
+
+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
+
+ vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
+ vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
+
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ // Gamma-correct only rgb
+ vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
+ vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
+ vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
+ }
+
+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+ vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
+
+ // moving to 8 wide integer vector types
+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+ __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
+
+#if KNOB_ARCH == KNOB_ARCH_AVX
+
+ // splitting into two sets of 4 wide integer vector types
+ // because AVX doesn't have instructions to support this operation at 8 wide
+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+ __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+
+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+ __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+
+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
+ srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
+ srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+ srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+
+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+ srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+
+ // unpack into rows that get the tiling order correct
+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+
+ __m256i final = _mm256_castsi128_si256(vRow00);
+ final = _mm256_insertf128_si256(final, vRow10, 1);
+
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+
+ // logic is as above, only wider
+ src1 = _mm256_slli_si256(src1, 1);
+ src2 = _mm256_slli_si256(src2, 2);
+ src3 = _mm256_slli_si256(src3, 3);
+
+ src0 = _mm256_or_si256(src0, src1);
+ src2 = _mm256_or_si256(src2, src3);
+
+ __m256i final = _mm256_or_si256(src0, src2);
+
+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
+ final = _mm256_permute4x64_epi64(final, 0xD8);
+
+#endif
+
+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+}
+
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar);
+
+ // swizzle rgba -> bgra while we load
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc +
(FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb + // clamp + vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); + vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); + + vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); + vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); + + vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); + vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); + + if (FormatTraits<DstFormat>::isSRGB) + { + // Gamma-correct only rgb + vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); + vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); + vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); + } + + // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format + vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); + vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); + vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); + + // moving to 8 wide integer vector types + __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr + __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg + __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb + +#if KNOB_ARCH == KNOB_ARCH_AVX + + // splitting into two sets of 4 wide integer vector types + // because AVX doesn't have instructions to support this operation at 8 wide + __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r + __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g + __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b + + __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r + __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g + __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b + + srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 + srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 + srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 + srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 + + srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr + + srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr + + srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr + srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr + + // unpack into rows that get the tiling order correct + __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr + __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); + + __m256i final = _mm256_castsi128_si256(vRow00); + final = _mm256_insertf128_si256(final, vRow10, 1); + +#elif KNOB_ARCH == KNOB_ARCH_AVX2 + + // logic is as above, only wider + src1 = _mm256_slli_si256(src1, 1); + src2 = _mm256_slli_si256(src2, 2); + + src0 = _mm256_or_si256(src0, src1); + + __m256i final = _mm256_or_si256(src0, src2); + + // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 + final = _mm256_permute4x64_epi64(final, 0xD8); + +#endif + + _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); +} + +template<> +struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, 
uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreRasterTile +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Retrieve color from hot tile source which is always float. + /// @param pSrc - Pointer to raster tile. + /// @param x, y - Coordinates to raster tile. + /// @param output - output color + INLINE static void GetSwizzledSrcColor( + uint8_t* pSrc, + uint32_t x, uint32_t y, + float outputColor[4]) + { + typedef SimdTile<SrcFormat, DstFormat> SimdT; + + SimdT* pSrcSimdTiles = (SimdT*)pSrc; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->GetSwizzledColor(simdOffset, outputColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 
+ { + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + // Perform bounds checking. + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + float srcColor[4]; + GetSwizzledSrcColor(pSrc, rx, ry, srcColor); + + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false>((x + rx), (y + ry), + pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, + sampleNum, pDstSurface->lod, pDstSurface); + ConvertPixelFromFloat<DstFormat>(pDst, srcColor); + } + } + } + } +}; + +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> +{}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. 
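+ // Each Convert() consumes one SIMD tile (KNOB_SIMD_WIDTH pixels laid out as
+ // two rows of KNOB_SIMD_WIDTH / 2), which is why each row pointer advances
+ // by half a SIMD's worth of destination bytes per column iteration.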
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
+ INLINE static void Store(
+ uint8_t *pSrc,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+ {
+ // Punt non-full tiles to generic store
+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+ if (x + KNOB_TILE_X_DIM > lodWidth ||
+ y + KNOB_TILE_Y_DIM > lodHeight)
+ {
+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+ }
+
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+
+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+ {
+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
+
+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+ {
+ // Format conversion and convert from SOA to AOS, and store the rows.
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
+ INLINE static void Store(
+ uint8_t *pSrc,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+ {
+ // Punt non-full tiles to generic store
+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+ if (x + KNOB_TILE_X_DIM > lodWidth ||
+ y + KNOB_TILE_Y_DIM > lodHeight)
+ {
+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+ }
+
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+
+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+ {
+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
+
+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+ {
+ // Format conversion and convert from SOA to AOS, and store the rows.
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t MAX_DST_COLUMN_BYTES = 16;
+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
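+ /// @note At 64bpp a SIMD row of 4 pixels spans 32B, twice MAX_DST_COLUMN_BYTES,
+ /// so each of the two destination rows needs two column pointers (4 total).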
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = + { + ppDsts[0], + ppDsts[1], + ppDsts[2], + ppDsts[3], + }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; + ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 4; + ppDsts[1] += DestRowWidthBytes / 4; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. 
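+ // For 16bpp a 4-pixel SIMD row covers only 8B, so both SIMD tiles of a raster
+ // row land in the same 16B TileY column; the second Convert below writes at a
+ // DestRowWidthBytes / 2 offset instead of stepping to the next column.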
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 2; + ppDsts[1] += DestRowWidthBytes / 2; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 512; // 512B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pRow1 = pRow0 + DestRowWidthBytes; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) + { + uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8); + + uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 
+ pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + } + + pRow0 += (DestRowWidthBytes * 2); + pRow1 += (DestRowWidthBytes * 2); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes; + ppDsts[1] += DestColumnBytes; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. 
+ /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pCol1 = pCol0 + DestColumnBytes; + + // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + uint8_t* ppDsts[] = + { + pCol0 + rowOffset, + pCol0 + rowOffset + DestRowWidthBytes, + pCol1 + rowOffset, + pCol1 + rowOffset + DestRowWidthBytes, + }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes * 2; + ppDsts[1] += DestColumnBytes * 2; + ppDsts[2] += DestColumnBytes * 2; + ppDsts[3] += DestColumnBytes * 2; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; + + static const size_t TILE_Y_COL_WIDTH_BYTES = 16; + static const size_t TILE_Y_ROWS = 32; + static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; + + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreMacroTile - Stores a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface using safe implementation. + /// @param pSrc - Pointer to macro tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to macro tile + static void StoreGeneric( + uint8_t *pSrcHotTile, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + // Store each raster tile from the hot tile to the destination surface. 
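+ // Raster tiles (and each sample's copy of them) are contiguous in the hot
+ // tile, so pSrcHotTile advances linearly while (x + col, y + row) walks the
+ // macro tile in raster order.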
+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum,
+ renderTargetArrayIndex);
+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+ }
+ }
+ }
+ }
+
+ typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores a macrotile to the destination surface.
+ /// @param pSrcHotTile - Pointer to macro tile (hot tile).
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to macro tile
+ static void Store(
+ uint8_t *pSrcHotTile,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
+ {
+ PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false>(
+ 0,
+ 0,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
+ sampleNum,
+ pDstSurface->lod,
+ pDstSurface);
+
+ // Force the generic (safe) store-tile path when a tiled LOD surface doesn't start
+ // on a 4KB page boundary, or when the surface uses interleaved samples.
+ bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || (pDstSurface->bInterleavedSamples);
+
+ pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
+ }
+
+ // Store each raster tile from the hot tile to the destination surface.
+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+ }
+ }
+ }
+ }
+};
+
+static void BUCKETS_START(UINT id)
+{
+#ifdef KNOB_ENABLE_RDTSC
+ gBucketMgr.StartBucket(id);
+#endif
+}
+
+static void BUCKETS_STOP(UINT id)
+{
+#ifdef KNOB_ENABLE_RDTSC
+ gBucketMgr.StopBucket(id);
+#endif
+}
+
+// on demand buckets for store tiles
+static std::mutex sBucketMutex;
+static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Deswizzles and stores a full hottile to a render surface
+/// @param pDstSurface - Destination surface state
+/// @param srcFormat - Format for hot tile.
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param renderTargetArrayIndex - Render target array slice
+/// @param pSrcHotTile - Pointer to Hot Tile +void StoreHotTile( + SWR_SURFACE_STATE *pDstSurface, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, + uint8_t *pSrcHotTile) +{ + if (pDstSurface->type == SURFACE_NULL) + { + return; + } + + // force 0 if requested renderTargetArrayIndex is OOB + if (renderTargetArrayIndex >= pDstSurface->depth) + { + renderTargetArrayIndex = 0; + } + + PFN_STORE_TILES pfnStoreTiles = nullptr; + + if ((renderTargetIndex <= SWR_ATTACHMENT_COLOR7) && (pDstSurface->tileMode != SWR_TILE_MODE_WMAJOR)) + { + pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; + } + else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) + { + pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; + } + else + { + pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; + } + + if(nullptr == pfnStoreTiles) + { + SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles"); + return; + } + + // Store a macro tile +#ifdef KNOB_ENABLE_RDTSC + if (sBuckets[pDstSurface->format] == -1) + { + // guard sBuckets update since storetiles is called by multiple threads + sBucketMutex.lock(); + if (sBuckets[pDstSurface->format] == -1) + { + const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); + BUCKET_DESC desc{info.name, "", false, 0xffffffff}; + sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc); + } + sBucketMutex.unlock(); + } +#endif + + BUCKETS_START(sBuckets[pDstSurface->format]); + pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); + BUCKETS_STOP(sBuckets[pDstSurface->format]); +} + +////////////////////////////////////////////////////////////////////////// +/// InitStoreTilesTable - Helper for setting up the tables. 
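+/// Tables are indexed as table[tileMode][format]. Entries left null by the
+/// memsets in InitSimStoreTilesTable() cause StoreHotTile to assert rather
+/// than store through an unsupported format / tile-mode combination.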
+template <SWR_TILE_MODE TileModeT, size_t NumTileModesT, size_t ArraySizeT> +void InitStoreTilesTableColor( + PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) +{ + table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; + table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; + table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; + table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; + table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; + table[TileModeT][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; + table[TileModeT][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; + table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; + table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; + table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; + table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; + table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; + table[TileModeT][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; + table[TileModeT][R32G32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; + table[TileModeT][R32G32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; + table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; + table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; + table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; + table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; + table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; + + table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; + table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; + table[TileModeT][R8G8B8A8_SINT] = 
StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; + table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; + table[TileModeT][R16G16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; + table[TileModeT][R16G16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; + table[TileModeT][R16G16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; + table[TileModeT][R16G16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; + table[TileModeT][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; + table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; + + table[TileModeT][R32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; + table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; + table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; + table[TileModeT][A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; + table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; + table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; + table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; + table[TileModeT][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; + table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8G8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; + table[TileModeT][R8G8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; + 
table[TileModeT][R8G8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; + table[TileModeT][R8G8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; + table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; + table[TileModeT][R16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; + table[TileModeT][R16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; + table[TileModeT][R16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; + table[TileModeT][R16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; + table[TileModeT][A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; + table[TileModeT][A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; + table[TileModeT][R8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; + table[TileModeT][R8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; + table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; + table[TileModeT][A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; + table[TileModeT][BC1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM>::Store; + table[TileModeT][BC2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM>::Store; + table[TileModeT][BC3_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM>::Store; + table[TileModeT][BC4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_UNORM>::Store; + table[TileModeT][BC5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_UNORM>::Store; + table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; + table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; + table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; + table[TileModeT][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; + table[TileModeT][BC4_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_SNORM>::Store; + table[TileModeT][BC5_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_SNORM>::Store; + table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; + table[TileModeT][R16G16B16_UNORM] = 
StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
+    table[TileModeT][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
+    table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
+    table[TileModeT][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
+    table[TileModeT][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
+
+    // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+    table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
+    table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
+
+    table[TileModeT][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
+    table[TileModeT][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper template for setting up the depth entries of the
+///        StoreTile tables for one tile mode.
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableDepth(
+    PFN_STORE_TILES (&table)[NumTileModes][ArraySizeT])
+{
+    table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R32_FLOAT>::Store;
+    table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
+    table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32_FLOAT, R16_UNORM>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper template for setting up the stencil entries of the
+///        StoreTile tables for one tile mode.
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableStencil(
+    PFN_STORE_TILES (&table)[NumTileModes][ArraySizeT])
+{
+    table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R8_UINT, R32_UINT>::Store;
+    table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R8_UINT, R8_UINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for StoreTile
+void InitSimStoreTilesTable()
+{
+    memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor));
+    memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth));
+    // zero the stencil table as well so unhandled mode/format pairs stay null
+    memset(sStoreTilesTableStencil, 0, sizeof(sStoreTilesTableStencil));
+
+    InitStoreTilesTableColor<SWR_TILE_NONE>(sStoreTilesTableColor);
+    InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
+
+    InitStoreTilesTableColor<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
+    InitStoreTilesTableColor<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
+
+    InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
+}
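The tables above are filled once at init time so the hot path can dispatch a store on (tileMode, format) with a single indexed load and an indirect call. Purely as an illustration of that pattern, and using toy enums rather than SWR's actual PFN_STORE_TILES signature or format list, a minimal self-contained sketch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    enum Mode { MODE_A, MODE_B, NUM_MODES };
    enum Fmt  { FMT_X, FMT_Y, NUM_FMTS };

    typedef void (*PFN_OP)(uint32_t);

    // One template instantiation per (mode, format) pair, mirroring
    // StoreMacroTile<TilingTraits<...>, ...>::Store in shape only.
    template <Mode M, Fmt F>
    struct Op
    {
        static void Apply(uint32_t v) { std::printf("mode %d fmt %d: %u\n", (int)M, (int)F, v); }
    };

    static PFN_OP sTable[NUM_MODES][NUM_FMTS];

    // Fills one row of the table, like InitStoreTilesTableColor<TileModeT>.
    template <Mode M>
    void InitRow(PFN_OP (&table)[NUM_MODES][NUM_FMTS])
    {
        table[M][FMT_X] = Op<M, FMT_X>::Apply;
        table[M][FMT_Y] = Op<M, FMT_Y>::Apply;
    }

    int main()
    {
        std::memset(sTable, 0, sizeof(sTable));
        InitRow<MODE_A>(sTable);
        InitRow<MODE_B>(sTable);
        sTable[MODE_B][FMT_Y](42); // hot-path dispatch: indexed load + indirect call
        return 0;
    }

The template helper keeps each (mode, format) combination a distinct, fully specialized instantiation, so the compiler can optimize each store path independently.

diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h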
b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
new file mode 100644
index 00000000000..a14f3bf3f7c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -0,0 +1,581 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file TilingFunctions.h
+*
+* @brief Tiling functions.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/state.h"
+#include "core/format_traits.h"
+#include "memory/tilingtraits.h"
+
+#include <algorithm>
+
+#define MAX_NUM_LOD 15
+
+#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has an issue when a uint32 align is used with a 64-bit value, since the ~'ed mask remains 32-bit.
+
+//////////////////////////////////////////////////////////////////////////
+/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
+struct SimdTile
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
+        {
+            outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]];
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Set color within simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        // Only loop over the components needed for destination.
+        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
+        {
+            this->color[i][offset[index]] = src[i];
+        }
+    }
+};
+
+template<>
+struct SimdTile <R8_UINT,R8_UINT>
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
+            outputColor[i] = *(float*)&src;
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Set color within simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        // Only loop over the components needed for destination.
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            this->color[i][offset[index]] = *(uint8_t*)&src[i];
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes lod offset for 1D surface at specified lod.
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffset1D(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod == 0)
+    {
+        offset = 0;
+    }
+    else
+    {
+        uint32_t curWidth = baseWidth;
+        // translate mip width from pixels to blocks for block compressed formats
+        // @note hAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) curWidth /= info.bcWidth;
+
+        offset = GFX_ALIGN(curWidth, hAlign);
+        for (uint32_t l = 1; l < lod; ++l)
+        {
+            curWidth = GFX_ALIGN(std::max<uint32_t>(curWidth >> 1, 1U), hAlign);
+            offset += curWidth;
+        }
+
+        if (info.isSubsampled)
+        {
+            offset /= info.bcWidth;
+        }
+    }
+}
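To make GFX_ALIGN and the mip-offset walk concrete, here is a small standalone check that mirrors (but does not reuse) the code above. It uses assumed values only: a baseWidth of 100 with hAlign 16 places lod1 at offset 112 and lod2 at 176, and the second pair of asserts demonstrates the 64-bit pitfall the comment on GFX_ALIGN warns about:

    #include <cassert>
    #include <cstdint>

    // Same rounding as GFX_ALIGN: round x up to a multiple of a.
    static uint64_t align_sub(uint64_t x, uint32_t a)
    {
        return ((x + (a - 1)) - ((x + (a - 1)) & (a - 1)));
    }

    // The rejected alternative: the 32-bit ~ mask silently truncates 64-bit x.
    static uint64_t align_not(uint64_t x, uint32_t a)
    {
        return (x + (a - 1)) & ~(a - 1);
    }

    int main()
    {
        assert(align_sub(13, 8) == 16);

        // Mip walk as in ComputeLODOffset1D: baseWidth = 100, hAlign = 16.
        // lod1 starts at align(100,16) = 112; lod2 at 112 + align(50,16) = 176.
        uint32_t offset = (uint32_t)align_sub(100, 16);
        uint32_t curWidth = (uint32_t)align_sub(100 >> 1, 16);
        assert(offset == 112 && offset + curWidth == 176);

        // 64-bit pitfall: ~(a-1) is evaluated in 32 bits and zero-extends,
        // wiping the upper half of x when the mask is applied.
        uint64_t big = 0x100000005ULL;
        assert(align_sub(big, 8) == 0x100000008ULL);
        assert(align_not(big, 8) == 0x8ULL);
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes x lod offset for 2D surface at specified lod.
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.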
+INLINE void ComputeLODOffsetX(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod < 2)
+    {
+        offset = 0;
+    }
+    else
+    {
+        uint32_t curWidth = baseWidth;
+        // convert mip width from pixels to blocks for block compressed formats
+        // @note hAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) curWidth /= info.bcWidth;
+
+        curWidth = std::max<uint32_t>(curWidth >> 1, 1U);
+        curWidth = GFX_ALIGN(curWidth, hAlign);
+
+        if (info.isSubsampled)
+        {
+            curWidth /= info.bcWidth;
+        }
+
+        offset = curWidth;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes y lod offset for 2D surface at specified lod.
+/// @param baseHeight - height of basemip (mip 0).
+/// @param vAlign - vertical alignment per mip, in rows
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffsetY(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseHeight,
+    uint32_t vAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod == 0)
+    {
+        offset = 0;
+    }
+    else
+    {
+        offset = 0;
+        uint32_t mipHeight = baseHeight;
+
+        // translate mip height from pixels to blocks for block compressed formats
+        // @note VAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) mipHeight /= info.bcHeight;
+
+        for (uint32_t l = 1; l <= lod; ++l)
+        {
+            uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign);
+            // mip 2 is placed to the right of mip 1 rather than below it,
+            // so it contributes no additional rows
+            offset += ((l != 2) ? alignedMipHeight : 0);
+            mipHeight = std::max<uint32_t>(mipHeight >> 1, 1U);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 1D surface offset
+/// @param x - offset from start of array slice at given lod.
+/// @param array - array slice index
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output offset in bytes.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset1D(
+    uint32_t x,
+    uint32_t array,
+    uint32_t lod,
+    const SWR_SURFACE_STATE *pState,
+    uint32_t &xOffsetBytes)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffset;
+
+    if (UseCachedOffsets)
+    {
+        lodOffset = pState->lodOffsets[0][lod];
+    }
+    else
+    {
+        ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset);
+    }
+
+    xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Adjusts the x,y coordinates (interleaved-sample tiling) or the
+///        array index (array-sliced samples) to address the requested sample.
+/// @param pState - surface state
+/// @param x - x coordinate, adjusted in place
+/// @param y - y coordinate, adjusted in place
+/// @param arrayIndex - array slice index, adjusted in place
+/// @param sampleNum - requested sample
+INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum)
+{
+    /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
+    if((pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+        pState->tileMode == SWR_TILE_MODE_WMAJOR) &&
+        pState->bInterleavedSamples)
+    {
+        uint32_t newX, newY, newSampleX, newSampleY;
+        switch(pState->numSamples)
+        {
+        case 1:
+            newX = x;
+            newY = y;
+            newSampleX = newSampleY = 0;
+            break;
+        case 2:
+        {
+            assert(pState->type == SURFACE_2D);
+            // pdep opens a hole at bit 1 of x; the sample bit is OR'd in there below
+            static const uint32_t xMask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x1;
+            newX = pdep_u32(x, xMask);
+            newY = y;
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = 0;
+        }
+        break;
+        case 4:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t mask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x1;
+            static const uint32_t sampleMaskY = 0x2;
+            newX = pdep_u32(x, mask);
+            newY = pdep_u32(y, mask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        case 8:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t xMask = 0xFFFFFFF9;
+            static const uint32_t yMask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x5;
+            static const uint32_t sampleMaskY = 0x2;
+            newX = pdep_u32(x, xMask);
+            newY = pdep_u32(y, yMask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        case 16:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t mask = 0xFFFFFFF9;
+            static const uint32_t sampleMaskX = 0x5;
+            static const uint32_t sampleMaskY = 0xA;
+            newX = pdep_u32(x, mask);
+            newY = pdep_u32(y, mask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        default:
+            assert(0 && "Unsupported sample count");
+            newX = newY = 0;
+            newSampleX = newSampleY = 0;
+            break;
+        }
+        x = newX | (newSampleX << 1);
+        y = newY | (newSampleY << 1);
+    }
+    else if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+            pState->tileMode == SWR_TILE_NONE)
+    {
+        uint32_t sampleShift;
+        switch(pState->numSamples)
+        {
+        case 1:
+            assert(sampleNum == 0);
+            sampleShift = 0;
+            break;
+        case 2:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 1;
+            break;
+        case 4:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 2;
+            break;
+        case 8:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 3;
+            break;
+        case 16:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 4;
+            break;
+        default:
+            assert(0 && "Unsupported sample count");
+            sampleShift = 0;
+            break;
+        }
+        arrayIndex = (arrayIndex << sampleShift) | sampleNum;
+    }
+}
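The masks above interleave sample bits into the texel coordinates: pdep scatters the coordinate bits around "holes" where the sample bits land, and pext pulls the sample bits out of sampleNum. A hedged sketch with a software pdep (mirroring the fallback in tilingtraits.h, added later in this patch; the real code uses the BMI2 instruction when available) for the 2x/4x mask 0xFFFFFFFD:

    #include <cassert>
    #include <cstdint>

    // Deposit a's low bits into the set bits of mask, low to high.
    static uint32_t pdep32(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 1; bit; bit <<= 1)
            if (mask & bit) { if (a & 1) result |= bit; a >>= 1; }
        return result;
    }

    int main()
    {
        // Mask 0xFFFFFFFD keeps bit 1 clear as a hole for one sample bit.
        uint32_t x = 5;                        // 0b101
        uint32_t newX = pdep32(x, 0xFFFFFFFD); // 0b1001: x's bits skip over bit 1
        assert(newX == 9);
        uint32_t sampleX = 1;                  // sample bit extracted from sampleNum
        assert((newX | (sampleX << 1)) == 11); // final interleaved x coordinate
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 2D surface offset
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param array - array slice index
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.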
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffsetX, lodOffsetY;
+
+    if (UseCachedOffsets)
+    {
+        lodOffsetX = pState->lodOffsets[0][lod];
+        lodOffsetY = pState->lodOffsets[1][lod];
+    }
+    else
+    {
+        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
+        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
+    }
+
+    AdjustCoordsForMSAA(pState, x, y, array, sampleNum);
+    xOffsetBytes = (x + lodOffsetX + pState->xOffset) * info.Bpp;
+    yOffsetRows = (array * pState->qpitch) + lodOffsetY + y + pState->yOffset;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 3D surface offset
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param z - depth offset from start of array slice and lod.
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.
+/// @param zOffsetSlices - output z offset in slices.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffsetX, lodOffsetY;
+
+    if (UseCachedOffsets)
+    {
+        lodOffsetX = pState->lodOffsets[0][lod];
+        lodOffsetY = pState->lodOffsets[1][lod];
+    }
+    else
+    {
+        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
+        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
+    }
+
+    xOffsetBytes = (x + lodOffsetX) * info.Bpp;
+    yOffsetRows = lodOffsetY + y;
+    zOffsetSlices = z;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+    return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param zOffsetSlices - z offset from base of surface in slices
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+    return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+    switch (pState->tileMode)
+    {
+    case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+    default: SWR_ASSERT(0, "Unsupported tiling mode");
+    }
+    return (uint32_t) NULL;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param zOffsetSlices - z offset from base of surface in slices
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+    switch (pState->tileMode)
+    {
+    case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    case SWR_TILE_SWRZ: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    default: SWR_ASSERT(0, "Unsupported tiling mode");
+    }
+    return (uint32_t) NULL;
+}
+
+template<bool UseCachedOffsets>
+INLINE
+uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
+{
+    uint32_t offsetX = 0, offsetY = 0, offsetZ = 0;
+    switch (pState->type)
+    {
+    case SURFACE_BUFFER:
+    case SURFACE_STRUCTURED_BUFFER:
+        offsetX = x * pState->pitch;
+        return offsetX;
+        break;
+    case SURFACE_1D:
+        ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX);
+        return TileSwizzle2D(offsetX, 0, pState);
+        break;
+    case SURFACE_2D:
+        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
+        return TileSwizzle2D(offsetX, offsetY, pState);
+    case SURFACE_3D:
+        ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ);
+        return TileSwizzle3D(offsetX, offsetY, offsetZ, pState);
+        break;
+    case SURFACE_CUBE:
+        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
+        return TileSwizzle2D(offsetX, offsetY, pState);
+        break;
+    default: SWR_ASSERT(0, "Unsupported surface type");
+    }
+
+    return (uint32_t) NULL;
+}
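For SWR_TILE_NONE the swizzle degenerates to the familiar linear math: with Cu = Cv = 0 and zero pdep masks, ComputeOffset2D reduces to tileID = yOffsetRows * pitch + xOffsetBytes. A toy check of the SURFACE_2D path under assumed values (not SWR's structs), for lod 0, slice 0, no MSAA:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint32_t pitch = 4096;     // bytes per row (assumed)
        const uint32_t Bpp = 4;          // e.g. an R8G8B8A8 format
        uint32_t x = 10, y = 7;          // texel coordinates
        uint32_t xOffsetBytes = x * Bpp; // no lod/array/xOffset terms in this toy case
        uint32_t yOffsetRows = y;
        // linear (untiled) final offset: row * pitch + byte offset within row
        assert(yOffsetRows * pitch + xOffsetBytes == 7u * 4096u + 40u);
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes surface address at the given location and lod
+/// @param x - x location in pixels
+/// @param y - y location in rows
+/// @param z - z location for 3D surfaces
+/// @param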
array - array slice for 1D and 2D surfaces +/// @param lod - level of detail +/// @param pState - pointer to the surface state +template<bool UseCachedOffsets> +INLINE +void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) +{ + return pState->pBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h new file mode 100644 index 00000000000..50f8e57c22a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h @@ -0,0 +1,263 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file tilingtraits.h +* +* @brief Tiling traits. +* +******************************************************************************/ +#pragma once + +#include "core/state.h" + +template<SWR_TILE_MODE mode, int> +struct TilingTraits +{ + static const SWR_TILE_MODE TileMode{ mode }; + static UINT GetCu() { SWR_ASSERT(0); return 0; } + static UINT GetCv() { SWR_ASSERT(0); return 0; } + static UINT GetCr() { SWR_ASSERT(0); return 0; } + static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; } + + /// @todo correct pdep shifts for all rastertile dims. Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } +}; + +template<int X> struct TilingTraits <SWR_TILE_NONE, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; + static UINT GetCu() { return 0; } + static UINT GetCv() { return 0; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 0; } + static UINT GetPdepX() { return 0x00; } + static UINT GetPdepY() { return 0x00; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 8> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } + + /// @todo correct pdep shifts for all rastertile dims. 
Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 32> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; } + + static UINT GetPdepX() { return 0x37; } + static UINT GetPdepY() { return 0xC8; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 128> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; } + + /// @todo correct pdep shifts for all rastertile dims. Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } +}; + +// y-major tiling layout unaffected by element size +template<int X> struct TilingTraits <SWR_TILE_MODE_YMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR }; + static UINT GetCu() { return 7; } + static UINT GetCv() { return 5; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0xe0f; } + static UINT GetPdepY() { return 0x1f0; } +}; + +// x-major tiling layout unaffected by element size +template<int X> struct TilingTraits <SWR_TILE_MODE_XMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR }; + static UINT GetCu() { return 9; } + static UINT GetCv() { return 3; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0x1ff; } + static UINT GetPdepY() { return 0xe00; } +}; + +template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR }; + static UINT GetCu() { return 6; } + static UINT GetCv() { return 6; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0xe15; } + static UINT GetPdepY() { return 0x1ea; } +}; + +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. 
clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
+{
+    UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
+    return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 3D tiled surfaces
+/// @param qpitch - surface qpitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+/// @param tileZ - z offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
+{
+    UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
+    return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+template<typename TTraits>
+INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
+{
+    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
+    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+    return (tileID | xSwizzle | ySwizzle);
+}
+
+#if KNOB_ARCH <= KNOB_ARCH_AVX
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 2D tiled surfaces. Specialization
+///        for tile-y surfaces that uses bit twiddling instead of pdep emulation.
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+template<>
+INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
+{
+    typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
+
+    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
+    UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf);
+    UINT ySwizzle = (y << 4) & 0x1f0;
+    return (tileID | xSwizzle | ySwizzle);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 3D tiled surfaces
+/// @param qpitch - depth pitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+/// @param z - z offset in slices
+template<typename TTraits>
+INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
+{
+    UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
+    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+    return (tileID | xSwizzle | ySwizzle);
+}
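The TileY specialization above replaces pdep emulation with fixed shifts and masks; the twiddling is exactly pdep with the masks from TilingTraits<SWR_TILE_MODE_YMAJOR, 32> (0xe0f for x, 0x1f0 for y). A small exhaustive check over one 128-byte x 32-row TileY tile, using a software pdep like the fallback above:

    #include <cassert>
    #include <cstdint>

    // Deposit a's low bits into the set bits of mask, low to high.
    static uint32_t pdep32(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 1; bit; bit <<= 1)
            if (mask & bit) { if (a & 1) result |= bit; a >>= 1; }
        return result;
    }

    int main()
    {
        // Within one TileY tile: x in [0,128) bytes, y in [0,32) rows.
        for (uint32_t x = 0; x < 128; ++x)
            for (uint32_t y = 0; y < 32; ++y)
            {
                uint32_t xs = ((x << 5) & 0xe00) | (x & 0xf); // bit-twiddled fast path
                uint32_t ys = (y << 4) & 0x1f0;
                assert(xs == pdep32(x, 0xe0f));  // x bits 0-3 stay, bits 4-6 -> 9-11
                assert(ys == pdep32(y, 0x1f0));  // y bits 0-4 -> 4-8
            }
        return 0;
    }

On AVX2 builds the generic template already compiles down to two native pdep instructions, which is why the specialization is only enabled for KNOB_ARCH <= KNOB_ARCH_AVX.

diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
new file mode 100644
index 00000000000..44ab69815b1
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.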
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Python source +from __future__ import print_function +import os +import sys +import knob_defs +from mako.template import Template +from mako.exceptions import RichTraceback + +def write_template_to_string(template_filename, **kwargs): + try: + template = Template(filename=template_filename) + # Split + Join fixes line-endings for whatever platform you are using + return '\n'.join(template.render(**kwargs).splitlines()) + except: + traceback = RichTraceback() + for (filename, lineno, function, line) in traceback.traceback: + print("File %s, line %s, in %s" % (filename, lineno, function)) + print(line, "\n") + print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) + +def write_template_to_file(template_filename, output_filename, **kwargs): + with open(output_filename, "w") as outfile: + print(write_template_to_string(template_filename, **kwargs), file=outfile) + +def main(args=sys.argv[1:]): + if len(args) != 1: + print('Usage:', sys.argv[0], '<output_directory>', file=sys.stderr) + return 1 + + output_dir = args[0] + if not os.path.isdir(output_dir): + if os.path.exists(output_dir): + print('ERROR: Invalid output directory:', output_dir, file=sys.stderr) + return 1 + + try: + os.makedirs(output_dir) + except: + print('ERROR: Could not create output directory:', output_dir, file=sys.stderr) + return 1 + + # Output path exists, now just run the template + template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template']) + output_file = os.sep.join([output_dir, 'gen_knobs.cpp']) + output_header = os.sep.join([output_dir, 'gen_knobs.h']) + + for f in [output_header, output_file]: + write_template_to_file(template_file, f, + filename='gen_knobs', + knobs=knob_defs.KNOBS, + includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'], + gen_header=True if f == output_header else False) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py new file mode 100644 index 00000000000..8c51e1e8e73 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -0,0 +1,226 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Python source +KNOBS = [ + ['ENABLE_ASSERT_DIALOGS', { + 'type' : 'bool', + 'default' : 'true', + 'desc' : ['Use dialogs when asserts fire.', + 'Asserts are only enabled in debug builds'], + }], + + ['SINGLE_THREADED', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['If enabled will perform all rendering on the API thread.', + 'This is useful mainly for debugging purposes.'], + }], + + ['DUMP_SHADER_IR', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], + }], + + ['USE_GENERIC_STORETILE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Always use generic function for performing StoreTile.', + 'Will be slightly slower than using optimized (jitted) path'], + }], + + ['FAST_CLEAR', { + 'type' : 'bool', + 'default' : 'true', + 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', + 'defer clear execution to first backend op on hottile, or hottile store'], + }], + + ['MAX_NUMA_NODES', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', + ' 0 == ALL NUMA-nodes in the system', + ' N == Use at most N NUMA-nodes for rendering'], + }], + + ['MAX_CORES_PER_NUMA_NODE', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', + ' 0 == ALL non-API thread cores per NUMA-node', + ' N == Use at most N cores per NUMA-node'], + }], + + ['MAX_THREADS_PER_CORE', { + 'type' : 'uint32_t', + 'default' : '1', + 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', + ' 0 == ALL hyper-threads per core', + ' N == Use at most N hyper-threads per physical core'], + }], + + ['MAX_WORKER_THREADS', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum worker threads to spawn.', + '', + 'IMPORTANT: If this is non-zero, no worker threads will be bound to', + 'specific HW threads. 
They will all be "floating" SW threads.', + 'In this case, the above 3 KNOBS will be ignored.'], + }], + + ['BUCKETS_START_FRAME', { + 'type' : 'uint32_t', + 'default' : '1200', + 'desc' : ['Frame from when to start saving buckets data.', + '', + 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', + 'for this to have an effect.'], + }], + + ['BUCKETS_END_FRAME', { + 'type' : 'uint32_t', + 'default' : '1400', + 'desc' : ['Frame at which to stop saving buckets data.', + '', + 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', + 'for this to have an effect.'], + }], + + ['WORKER_SPIN_LOOP_COUNT', { + 'type' : 'uint32_t', + 'default' : '5000', + 'desc' : ['Number of spin-loop iterations worker threads will perform', + 'before going to sleep when waiting for work'], + }], + + ['MAX_DRAWS_IN_FLIGHT', { + 'type' : 'uint32_t', + 'default' : '160', + 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], + }], + + ['MAX_PRIMS_PER_DRAW', { + 'type' : 'uint32_t', + 'default' : '2040', + 'desc' : ['Maximum primitives in a single Draw().', + 'Larger primitives are split into smaller Draw calls.', + 'Should be a multiple of (3 * vectorWidth).'], + }], + + ['MAX_TESS_PRIMS_PER_DRAW', { + 'type' : 'uint32_t', + 'default' : '16', + 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', + 'Larger primitives are split into smaller Draw calls.', + 'Should be a multiple of (vectorWidth).'], + }], + + ['MAX_FRAC_ODD_TESS_FACTOR', { + 'type' : 'float', + 'default' : '63.0f', + 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], + }], + + ['MAX_FRAC_EVEN_TESS_FACTOR', { + 'type' : 'float', + 'default' : '64.0f', + 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], + }], + + ['MAX_INTEGER_TESS_FACTOR', { + 'type' : 'uint32_t', + 'default' : '64', + 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], + }], + + + ['BUCKETS_ENABLE_THREADVIZ', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Enable threadviz output.'], + }], + + ['TOSS_DRAW', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Disable per-draw/dispatch execution'], + }], + + ['TOSS_QUEUE_FE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at worker FE', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_FETCH', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at vertex fetch', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_IA', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at input assembler', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_VS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at vertex shader', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_SETUP_TRIS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at primitive setup', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_BIN_TRIS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at primitive binning', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_RS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at 
rasterizer', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + +] diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py new file mode 100644 index 00000000000..d9638481889 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py @@ -0,0 +1,8 @@ +# mako/__init__.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + + +__version__ = '1.0.1' diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py new file mode 100644 index 00000000000..efbc4fc245d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py @@ -0,0 +1,845 @@ +# mako/_ast_util.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +""" + ast + ~~~ + + The `ast` module helps Python applications to process trees of the Python + abstract syntax grammar. The abstract syntax itself might change with + each Python release; this module helps to find out programmatically what + the current grammar looks like and allows modifications of it. + + An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as + a flag to the `compile()` builtin function or by using the `parse()` + function from this module. The result will be a tree of objects whose + classes all inherit from `ast.AST`. + + A modified abstract syntax tree can be compiled into a Python code object + using the built-in `compile()` function. + + Additionally various helper functions are provided that make working with + the trees simpler. The main intention of the helper functions and this + module in general is to provide an easy to use interface for libraries + that work tightly with the python syntax (template engines for example). + + + :copyright: Copyright 2008 by Armin Ronacher. + :license: Python License. +""" +from _ast import * +from mako.compat import arg_stringname + +BOOLOP_SYMBOLS = { + And: 'and', + Or: 'or' +} + +BINOP_SYMBOLS = { + Add: '+', + Sub: '-', + Mult: '*', + Div: '/', + FloorDiv: '//', + Mod: '%', + LShift: '<<', + RShift: '>>', + BitOr: '|', + BitAnd: '&', + BitXor: '^' +} + +CMPOP_SYMBOLS = { + Eq: '==', + Gt: '>', + GtE: '>=', + In: 'in', + Is: 'is', + IsNot: 'is not', + Lt: '<', + LtE: '<=', + NotEq: '!=', + NotIn: 'not in' +} + +UNARYOP_SYMBOLS = { + Invert: '~', + Not: 'not', + UAdd: '+', + USub: '-' +} + +ALL_SYMBOLS = {} +ALL_SYMBOLS.update(BOOLOP_SYMBOLS) +ALL_SYMBOLS.update(BINOP_SYMBOLS) +ALL_SYMBOLS.update(CMPOP_SYMBOLS) +ALL_SYMBOLS.update(UNARYOP_SYMBOLS) + + +def parse(expr, filename='<unknown>', mode='exec'): + """Parse an expression into an AST node.""" + return compile(expr, filename, mode, PyCF_ONLY_AST) + + +def to_source(node, indent_with=' ' * 4): + """ + This function can convert a node tree back into python sourcecode. This + is useful for debugging purposes, especially if you're dealing with custom + asts not generated by python itself. + + It could be that the sourcecode is evaluable when the AST itself is not + compilable / evaluable. The reason for this is that the AST contains some + more data than regular sourcecode does, which is dropped during + conversion. 
+
+    Each level of indentation is replaced with `indent_with`. By default this
+    parameter is equal to four spaces as suggested by PEP 8, but it might be
+    adjusted to match the application's styleguide.
+    """
+    generator = SourceGenerator(indent_with)
+    generator.visit(node)
+    return ''.join(generator.result)
+
+
+def dump(node):
+    """
+    A very verbose representation of the node passed. This is useful for
+    debugging purposes.
+    """
+    def _format(node):
+        if isinstance(node, AST):
+            return '%s(%s)' % (node.__class__.__name__,
+                               ', '.join('%s=%s' % (a, _format(b))
+                                         for a, b in iter_fields(node)))
+        elif isinstance(node, list):
+            return '[%s]' % ', '.join(_format(x) for x in node)
+        return repr(node)
+    if not isinstance(node, AST):
+        raise TypeError('expected AST, got %r' % node.__class__.__name__)
+    return _format(node)
+
+
+def copy_location(new_node, old_node):
+    """
+    Copy the source location hint (`lineno` and `col_offset`) from the
+    old to the new node if possible and return the new one.
+    """
+    for attr in 'lineno', 'col_offset':
+        if attr in old_node._attributes and attr in new_node._attributes \
+           and hasattr(old_node, attr):
+            setattr(new_node, attr, getattr(old_node, attr))
+    return new_node
+
+
+def fix_missing_locations(node):
+    """
+    Some nodes require a line number and the column offset. Without that
+    information the compiler will abort the compilation. Because it can be
+    a dull task to add appropriate line numbers and column offsets when
+    adding new nodes this function can help. It copies the line number and
+    column offset of the parent node to the child nodes without this
+    information.
+
+    Unlike `copy_location` this works recursively and won't touch nodes that
+    already have location information.
+    """
+    def _fix(node, lineno, col_offset):
+        if 'lineno' in node._attributes:
+            if not hasattr(node, 'lineno'):
+                node.lineno = lineno
+            else:
+                lineno = node.lineno
+        if 'col_offset' in node._attributes:
+            if not hasattr(node, 'col_offset'):
+                node.col_offset = col_offset
+            else:
+                col_offset = node.col_offset
+        for child in iter_child_nodes(node):
+            _fix(child, lineno, col_offset)
+    _fix(node, 1, 0)
+    return node
+
+
+def increment_lineno(node, n=1):
+    """
+    Increment the line numbers of all nodes by `n` if they have line number
+    attributes. This is useful to "move code" to a different location in a
+    file.
+    """
+    # walk() yields the root node first, then all of its children
+    for node in walk(node):
+        if 'lineno' in node._attributes:
+            node.lineno = getattr(node, 'lineno', 0) + n
+
+
+def iter_fields(node):
+    """Iterate over all fields of a node, only yielding existing fields."""
+    # CPython 2.5 compat
+    if not hasattr(node, '_fields') or not node._fields:
+        return
+    for field in node._fields:
+        try:
+            yield field, getattr(node, field)
+        except AttributeError:
+            pass
+
+
+def get_fields(node):
+    """Like `iter_fields` but returns a dict."""
+    return dict(iter_fields(node))
+
+
+def iter_child_nodes(node):
+    """Iterate over all child nodes of a node."""
+    for name, field in iter_fields(node):
+        if isinstance(field, AST):
+            yield field
+        elif isinstance(field, list):
+            for item in field:
+                if isinstance(item, AST):
+                    yield item
+
+
+def get_child_nodes(node):
+    """Like `iter_child_nodes` but returns a list."""
+    return list(iter_child_nodes(node))
+
+
+def get_compile_mode(node):
+    """
+    Get the mode for `compile` of a given node. If the node is not a `mod`
+    node (`Expression`, `Module` etc.) a `TypeError` is thrown.
+ """ + if not isinstance(node, mod): + raise TypeError('expected mod node, got %r' % node.__class__.__name__) + return { + Expression: 'eval', + Interactive: 'single' + }.get(node.__class__, 'expr') + + +def get_docstring(node): + """ + Return the docstring for the given node or `None` if no docstring can be + found. If the node provided does not accept docstrings a `TypeError` + will be raised. + """ + if not isinstance(node, (FunctionDef, ClassDef, Module)): + raise TypeError("%r can't have docstrings" % node.__class__.__name__) + if node.body and isinstance(node.body[0], Str): + return node.body[0].s + + +def walk(node): + """ + Iterate over all nodes. This is useful if you only want to modify nodes in + place and don't care about the context or the order the nodes are returned. + """ + from collections import deque + todo = deque([node]) + while todo: + node = todo.popleft() + todo.extend(iter_child_nodes(node)) + yield node + + +class NodeVisitor(object): + """ + Walks the abstract syntax tree and call visitor functions for every node + found. The visitor functions may return values which will be forwarded + by the `visit` method. + + Per default the visitor functions for the nodes are ``'visit_'`` + + class name of the node. So a `TryFinally` node visit function would + be `visit_TryFinally`. This behavior can be changed by overriding + the `get_visitor` function. If no visitor function exists for a node + (return value `None`) the `generic_visit` visitor is used instead. + + Don't use the `NodeVisitor` if you want to apply changes to nodes during + traversing. For this a special visitor exists (`NodeTransformer`) that + allows modifications. + """ + + def get_visitor(self, node): + """ + Return the visitor function for this node or `None` if no visitor + exists for this node. In that case the generic visit function is + used instead. + """ + method = 'visit_' + node.__class__.__name__ + return getattr(self, method, None) + + def visit(self, node): + """Visit a node.""" + f = self.get_visitor(node) + if f is not None: + return f(node) + return self.generic_visit(node) + + def generic_visit(self, node): + """Called if no explicit visitor function exists for a node.""" + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, AST): + self.visit(item) + elif isinstance(value, AST): + self.visit(value) + + +class NodeTransformer(NodeVisitor): + """ + Walks the abstract syntax tree and allows modifications of nodes. + + The `NodeTransformer` will walk the AST and use the return value of the + visitor functions to replace or remove the old node. If the return + value of the visitor function is `None` the node will be removed + from the previous location otherwise it's replaced with the return + value. The return value may be the original node in which case no + replacement takes place. + + Here an example transformer that rewrites all `foo` to `data['foo']`:: + + class RewriteName(NodeTransformer): + + def visit_Name(self, node): + return copy_location(Subscript( + value=Name(id='data', ctx=Load()), + slice=Index(value=Str(s=node.id)), + ctx=node.ctx + ), node) + + Keep in mind that if the node you're operating on has child nodes + you must either transform the child nodes yourself or call the generic + visit function for the node first. + + Nodes that were part of a collection of statements (that applies to + all statement nodes) may also return a list of nodes rather than just + a single node. 
+ + Usually you use the transformer like this:: + + node = YourTransformer().visit(node) + """ + + def generic_visit(self, node): + for field, old_value in iter_fields(node): + old_value = getattr(node, field, None) + if isinstance(old_value, list): + new_values = [] + for value in old_value: + if isinstance(value, AST): + value = self.visit(value) + if value is None: + continue + elif not isinstance(value, AST): + new_values.extend(value) + continue + new_values.append(value) + old_value[:] = new_values + elif isinstance(old_value, AST): + new_node = self.visit(old_value) + if new_node is None: + delattr(node, field) + else: + setattr(node, field, new_node) + return node + + +class SourceGenerator(NodeVisitor): + """ + This visitor is able to transform a well formed syntax tree into python + sourcecode. For more details have a look at the docstring of the + `node_to_source` function. + """ + + def __init__(self, indent_with): + self.result = [] + self.indent_with = indent_with + self.indentation = 0 + self.new_lines = 0 + + def write(self, x): + if self.new_lines: + if self.result: + self.result.append('\n' * self.new_lines) + self.result.append(self.indent_with * self.indentation) + self.new_lines = 0 + self.result.append(x) + + def newline(self, n=1): + self.new_lines = max(self.new_lines, n) + + def body(self, statements): + self.new_line = True + self.indentation += 1 + for stmt in statements: + self.visit(stmt) + self.indentation -= 1 + + def body_or_else(self, node): + self.body(node.body) + if node.orelse: + self.newline() + self.write('else:') + self.body(node.orelse) + + def signature(self, node): + want_comma = [] + def write_comma(): + if want_comma: + self.write(', ') + else: + want_comma.append(True) + + padding = [None] * (len(node.args) - len(node.defaults)) + for arg, default in zip(node.args, padding + node.defaults): + write_comma() + self.visit(arg) + if default is not None: + self.write('=') + self.visit(default) + if node.vararg is not None: + write_comma() + self.write('*' + arg_stringname(node.vararg)) + if node.kwarg is not None: + write_comma() + self.write('**' + arg_stringname(node.kwarg)) + + def decorators(self, node): + for decorator in node.decorator_list: + self.newline() + self.write('@') + self.visit(decorator) + + # Statements + + def visit_Assign(self, node): + self.newline() + for idx, target in enumerate(node.targets): + if idx: + self.write(', ') + self.visit(target) + self.write(' = ') + self.visit(node.value) + + def visit_AugAssign(self, node): + self.newline() + self.visit(node.target) + self.write(BINOP_SYMBOLS[type(node.op)] + '=') + self.visit(node.value) + + def visit_ImportFrom(self, node): + self.newline() + self.write('from %s%s import ' % ('.' 
* node.level, node.module))
+        for idx, item in enumerate(node.names):
+            if idx:
+                self.write(', ')
+            # each name is an `alias` node, so it has to be visited
+            self.visit(item)
+
+    def visit_Import(self, node):
+        for item in node.names:
+            self.newline()
+            self.write('import ')
+            self.visit(item)
+
+    def visit_Expr(self, node):
+        self.newline()
+        self.generic_visit(node)
+
+    def visit_FunctionDef(self, node):
+        self.newline(n=2)
+        self.decorators(node)
+        self.newline()
+        self.write('def %s(' % node.name)
+        self.signature(node.args)
+        self.write('):')
+        self.body(node.body)
+
+    def visit_ClassDef(self, node):
+        have_args = []
+        def paren_or_comma():
+            if have_args:
+                self.write(', ')
+            else:
+                have_args.append(True)
+                self.write('(')
+
+        self.newline(n=3)
+        self.decorators(node)
+        self.newline()
+        self.write('class %s' % node.name)
+        for base in node.bases:
+            paren_or_comma()
+            self.visit(base)
+        # XXX: the if here is used to keep this module compatible
+        #      with python 2.6.
+        if hasattr(node, 'keywords'):
+            for keyword in node.keywords:
+                paren_or_comma()
+                self.write(keyword.arg + '=')
+                self.visit(keyword.value)
+        if node.starargs is not None:
+            paren_or_comma()
+            self.write('*')
+            self.visit(node.starargs)
+        if node.kwargs is not None:
+            paren_or_comma()
+            self.write('**')
+            self.visit(node.kwargs)
+        self.write(have_args and '):' or ':')
+        self.body(node.body)
+
+    def visit_If(self, node):
+        self.newline()
+        self.write('if ')
+        self.visit(node.test)
+        self.write(':')
+        self.body(node.body)
+        while True:
+            else_ = node.orelse
+            if len(else_) == 1 and isinstance(else_[0], If):
+                node = else_[0]
+                self.newline()
+                self.write('elif ')
+                self.visit(node.test)
+                self.write(':')
+                self.body(node.body)
+            else:
+                # only emit an else block if one actually exists
+                if else_:
+                    self.newline()
+                    self.write('else:')
+                    self.body(else_)
+                break
+
+    def visit_For(self, node):
+        self.newline()
+        self.write('for ')
+        self.visit(node.target)
+        self.write(' in ')
+        self.visit(node.iter)
+        self.write(':')
+        self.body_or_else(node)
+
+    def visit_While(self, node):
+        self.newline()
+        self.write('while ')
+        self.visit(node.test)
+        self.write(':')
+        self.body_or_else(node)
+
+    def visit_With(self, node):
+        self.newline()
+        self.write('with ')
+        self.visit(node.context_expr)
+        if node.optional_vars is not None:
+            self.write(' as ')
+            self.visit(node.optional_vars)
+        self.write(':')
+        self.body(node.body)
+
+    def visit_Pass(self, node):
+        self.newline()
+        self.write('pass')
+
+    def visit_Print(self, node):
+        # XXX: python 2.6 only
+        self.newline()
+        self.write('print ')
+        want_comma = False
+        if node.dest is not None:
+            self.write(' >> ')
+            self.visit(node.dest)
+            want_comma = True
+        for value in node.values:
+            if want_comma:
+                self.write(', ')
+            self.visit(value)
+            want_comma = True
+        if not node.nl:
+            self.write(',')
+
+    def visit_Delete(self, node):
+        self.newline()
+        self.write('del ')
+        for idx, target in enumerate(node.targets):
+            if idx:
+                self.write(', ')
+            self.visit(target)
+
+    def visit_TryExcept(self, node):
+        self.newline()
+        self.write('try:')
+        self.body(node.body)
+        for handler in node.handlers:
+            self.visit(handler)
+
+    def visit_TryFinally(self, node):
+        self.newline()
+        self.write('try:')
+        self.body(node.body)
+        self.newline()
+        self.write('finally:')
+        self.body(node.finalbody)
+
+    def visit_Global(self, node):
+        self.newline()
+        self.write('global ' + ', '.join(node.names))
+
+    def visit_Nonlocal(self, node):
+        self.newline()
+        self.write('nonlocal ' + ', '.join(node.names))
+
+    def visit_Return(self, node):
+        self.newline()
+        if node.value is None:
+            self.write('return')
+        else:
+            self.write('return ')
+            self.visit(node.value)
+
+    def visit_Break(self, node):
+        self.newline()
+ self.write('break') + + def visit_Continue(self, node): + self.newline() + self.write('continue') + + def visit_Raise(self, node): + # XXX: Python 2.6 / 3.0 compatibility + self.newline() + self.write('raise') + if hasattr(node, 'exc') and node.exc is not None: + self.write(' ') + self.visit(node.exc) + if node.cause is not None: + self.write(' from ') + self.visit(node.cause) + elif hasattr(node, 'type') and node.type is not None: + self.visit(node.type) + if node.inst is not None: + self.write(', ') + self.visit(node.inst) + if node.tback is not None: + self.write(', ') + self.visit(node.tback) + + # Expressions + + def visit_Attribute(self, node): + self.visit(node.value) + self.write('.' + node.attr) + + def visit_Call(self, node): + want_comma = [] + def write_comma(): + if want_comma: + self.write(', ') + else: + want_comma.append(True) + + self.visit(node.func) + self.write('(') + for arg in node.args: + write_comma() + self.visit(arg) + for keyword in node.keywords: + write_comma() + self.write(keyword.arg + '=') + self.visit(keyword.value) + if node.starargs is not None: + write_comma() + self.write('*') + self.visit(node.starargs) + if node.kwargs is not None: + write_comma() + self.write('**') + self.visit(node.kwargs) + self.write(')') + + def visit_Name(self, node): + self.write(node.id) + + def visit_NameConstant(self, node): + self.write(str(node.value)) + + def visit_arg(self, node): + self.write(node.arg) + + def visit_Str(self, node): + self.write(repr(node.s)) + + def visit_Bytes(self, node): + self.write(repr(node.s)) + + def visit_Num(self, node): + self.write(repr(node.n)) + + def visit_Tuple(self, node): + self.write('(') + idx = -1 + for idx, item in enumerate(node.elts): + if idx: + self.write(', ') + self.visit(item) + self.write(idx and ')' or ',)') + + def sequence_visit(left, right): + def visit(self, node): + self.write(left) + for idx, item in enumerate(node.elts): + if idx: + self.write(', ') + self.visit(item) + self.write(right) + return visit + + visit_List = sequence_visit('[', ']') + visit_Set = sequence_visit('{', '}') + del sequence_visit + + def visit_Dict(self, node): + self.write('{') + for idx, (key, value) in enumerate(zip(node.keys, node.values)): + if idx: + self.write(', ') + self.visit(key) + self.write(': ') + self.visit(value) + self.write('}') + + def visit_BinOp(self, node): + self.write('(') + self.visit(node.left) + self.write(' %s ' % BINOP_SYMBOLS[type(node.op)]) + self.visit(node.right) + self.write(')') + + def visit_BoolOp(self, node): + self.write('(') + for idx, value in enumerate(node.values): + if idx: + self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)]) + self.visit(value) + self.write(')') + + def visit_Compare(self, node): + self.write('(') + self.visit(node.left) + for op, right in zip(node.ops, node.comparators): + self.write(' %s ' % CMPOP_SYMBOLS[type(op)]) + self.visit(right) + self.write(')') + + def visit_UnaryOp(self, node): + self.write('(') + op = UNARYOP_SYMBOLS[type(node.op)] + self.write(op) + if op == 'not': + self.write(' ') + self.visit(node.operand) + self.write(')') + + def visit_Subscript(self, node): + self.visit(node.value) + self.write('[') + self.visit(node.slice) + self.write(']') + + def visit_Slice(self, node): + if node.lower is not None: + self.visit(node.lower) + self.write(':') + if node.upper is not None: + self.visit(node.upper) + if node.step is not None: + self.write(':') + if not (isinstance(node.step, Name) and node.step.id == 'None'): + self.visit(node.step) + + def visit_ExtSlice(self, 
node): + for idx, item in node.dims: + if idx: + self.write(', ') + self.visit(item) + + def visit_Yield(self, node): + self.write('yield ') + self.visit(node.value) + + def visit_Lambda(self, node): + self.write('lambda ') + self.signature(node.args) + self.write(': ') + self.visit(node.body) + + def visit_Ellipsis(self, node): + self.write('Ellipsis') + + def generator_visit(left, right): + def visit(self, node): + self.write(left) + self.visit(node.elt) + for comprehension in node.generators: + self.visit(comprehension) + self.write(right) + return visit + + visit_ListComp = generator_visit('[', ']') + visit_GeneratorExp = generator_visit('(', ')') + visit_SetComp = generator_visit('{', '}') + del generator_visit + + def visit_DictComp(self, node): + self.write('{') + self.visit(node.key) + self.write(': ') + self.visit(node.value) + for comprehension in node.generators: + self.visit(comprehension) + self.write('}') + + def visit_IfExp(self, node): + self.visit(node.body) + self.write(' if ') + self.visit(node.test) + self.write(' else ') + self.visit(node.orelse) + + def visit_Starred(self, node): + self.write('*') + self.visit(node.value) + + def visit_Repr(self, node): + # XXX: python 2.6 only + self.write('`') + self.visit(node.value) + self.write('`') + + # Helper Nodes + + def visit_alias(self, node): + self.write(node.name) + if node.asname is not None: + self.write(' as ' + node.asname) + + def visit_comprehension(self, node): + self.write(' for ') + self.visit(node.target) + self.write(' in ') + self.visit(node.iter) + if node.ifs: + for if_ in node.ifs: + self.write(' if ') + self.visit(if_) + + def visit_excepthandler(self, node): + self.newline() + self.write('except') + if node.type is not None: + self.write(' ') + self.visit(node.type) + if node.name is not None: + self.write(' as ') + self.visit(node.name) + self.write(':') + self.body(node.body) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py new file mode 100644 index 00000000000..65fd84dfe15 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py @@ -0,0 +1,178 @@ +# mako/ast.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""utilities for analyzing expressions and blocks of Python +code, as well as generating Python from AST nodes""" + +from mako import exceptions, pyparser, compat +import re + +class PythonCode(object): + """represents information about a string containing Python code""" + def __init__(self, code, **exception_kwargs): + self.code = code + + # represents all identifiers which are assigned to at some point in + # the code + self.declared_identifiers = set() + + # represents all identifiers which are referenced before their + # assignment, if any + self.undeclared_identifiers = set() + + # note that an identifier can be in both the undeclared and declared + # lists. 
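+        # e.g. (illustrative): for PythonCode("x = x + 1"), "x" is assigned
+        # and also read before that assignment, so it lands in both sets.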
+
+        # using AST to parse instead of using code.co_varnames,
+        # code.co_names has several advantages:
+        # - we can locate an identifier as "undeclared" even if
+        #   it's declared later in the same block of code
+        # - AST is less likely to break with version changes
+        #   (for example, the behavior of co_names changed a little bit
+        #   in python version 2.5)
+        if isinstance(code, compat.string_types):
+            expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs)
+        else:
+            expr = code
+
+        f = pyparser.FindIdentifiers(self, **exception_kwargs)
+        f.visit(expr)
+
+class ArgumentList(object):
+    """parses a fragment of code as a comma-separated list of expressions"""
+    def __init__(self, code, **exception_kwargs):
+        self.codeargs = []
+        self.args = []
+        self.declared_identifiers = set()
+        self.undeclared_identifiers = set()
+        if isinstance(code, compat.string_types):
+            if re.match(r"\S", code) and not re.match(r",\s*$", code):
+                # if there's text and no trailing comma, ensure it's parsed
+                # as a tuple by adding a trailing comma
+                code += ","
+            expr = pyparser.parse(code, "exec", **exception_kwargs)
+        else:
+            expr = code
+
+        f = pyparser.FindTuple(self, PythonCode, **exception_kwargs)
+        f.visit(expr)
+
+class PythonFragment(PythonCode):
+    """extends PythonCode to provide identifier lookups in partial control
+    statements, e.g.::
+
+        for x in 5:
+        elif y==9:
+        except (MyException, e):
+
+    """
+    def __init__(self, code, **exception_kwargs):
+        m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S)
+        if not m:
+            raise exceptions.CompileException(
+                "Fragment '%s' is not a partial control statement" %
+                code, **exception_kwargs)
+        if m.group(3):
+            code = code[:m.start(3)]
+        (keyword, expr) = m.group(1, 2)
+        if keyword in ['for', 'if', 'while']:
+            code = code + "pass"
+        elif keyword == 'try':
+            code = code + "pass\nexcept:pass"
+        elif keyword == 'elif' or keyword == 'else':
+            code = "if False:pass\n" + code + "pass"
+        elif keyword == 'except':
+            code = "try:pass\n" + code + "pass"
+        elif keyword == 'with':
+            code = code + "pass"
+        else:
+            raise exceptions.CompileException(
+                "Unsupported control keyword: '%s'" %
+                keyword, **exception_kwargs)
+        super(PythonFragment, self).__init__(code, **exception_kwargs)
+
+
+class FunctionDecl(object):
+    """function declaration"""
+    def __init__(self, code, allow_kwargs=True, **exception_kwargs):
+        self.code = code
+        expr = pyparser.parse(code, "exec", **exception_kwargs)
+
+        f = pyparser.ParseFunc(self, **exception_kwargs)
+        f.visit(expr)
+        if not hasattr(self, 'funcname'):
+            raise exceptions.CompileException(
+                "Code '%s' is not a function declaration" % code,
+                **exception_kwargs)
+        if not allow_kwargs and self.kwargs:
+            raise exceptions.CompileException(
+                "'**%s' keyword argument not allowed here" %
+                self.kwargnames[-1], **exception_kwargs)
+
+    def get_argument_expressions(self, as_call=False):
+        """Return the argument declarations of this FunctionDecl as a
+        printable list.
+
+        By default the return value is appropriate for writing in a ``def``;
+        set `as_call` to true to build arguments to be passed to the function
+        instead (assuming locals with the same names as the arguments exist).
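+
+        For example (illustrative), for ``def foo(a, b=1, *args)`` this
+        returns roughly ``['a', 'b=1', '*args']``, while ``as_call=True``
+        yields ``['a', 'b', '*args']``.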
+ """ + + namedecls = [] + + # Build in reverse order, since defaults and slurpy args come last + argnames = self.argnames[::-1] + kwargnames = self.kwargnames[::-1] + defaults = self.defaults[::-1] + kwdefaults = self.kwdefaults[::-1] + + # Named arguments + if self.kwargs: + namedecls.append("**" + kwargnames.pop(0)) + + for name in kwargnames: + # Keyword-only arguments must always be used by name, so even if + # this is a call, print out `foo=foo` + if as_call: + namedecls.append("%s=%s" % (name, name)) + elif kwdefaults: + default = kwdefaults.pop(0) + if default is None: + # The AST always gives kwargs a default, since you can do + # `def foo(*, a=1, b, c=3)` + namedecls.append(name) + else: + namedecls.append("%s=%s" % ( + name, pyparser.ExpressionGenerator(default).value())) + else: + namedecls.append(name) + + # Positional arguments + if self.varargs: + namedecls.append("*" + argnames.pop(0)) + + for name in argnames: + if as_call or not defaults: + namedecls.append(name) + else: + default = defaults.pop(0) + namedecls.append("%s=%s" % ( + name, pyparser.ExpressionGenerator(default).value())) + + namedecls.reverse() + return namedecls + + @property + def allargnames(self): + return tuple(self.argnames) + tuple(self.kwargnames) + +class FunctionArgs(FunctionDecl): + """the argument portion of a function declaration""" + + def __init__(self, code, **kwargs): + super(FunctionArgs, self).__init__("def ANON(%s):pass" % code, + **kwargs) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py new file mode 100644 index 00000000000..c405c5171d7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py @@ -0,0 +1,238 @@ +# mako/cache.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +from mako import compat, util + +_cache_plugins = util.PluginLoader("mako.cache") + +register_plugin = _cache_plugins.register +register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl") + + +class Cache(object): + """Represents a data content cache made available to the module + space of a specific :class:`.Template` object. + + .. versionadded:: 0.6 + :class:`.Cache` by itself is mostly a + container for a :class:`.CacheImpl` object, which implements + a fixed API to provide caching services; specific subclasses exist to + implement different + caching strategies. Mako includes a backend that works with + the Beaker caching system. Beaker itself then supports + a number of backends (i.e. file, memory, memcached, etc.) + + The construction of a :class:`.Cache` is part of the mechanics + of a :class:`.Template`, and programmatic access to this + cache is typically via the :attr:`.Template.cache` attribute. + + """ + + impl = None + """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`. + + This accessor allows a :class:`.CacheImpl` with additional + methods beyond that of :class:`.Cache` to be used programmatically. + + """ + + id = None + """Return the 'id' that identifies this cache. + + This is a value that should be globally unique to the + :class:`.Template` associated with this cache, and can + be used by a caching system to name a local container + for data specific to this template. + + """ + + starttime = None + """Epochal time value for when the owning :class:`.Template` was + first compiled. 
+ + A cache implementation may wish to invalidate data earlier than + this timestamp; this has the effect of the cache for a specific + :class:`.Template` starting clean any time the :class:`.Template` + is recompiled, such as when the original template file changed on + the filesystem. + + """ + + def __init__(self, template, *args): + # check for a stale template calling the + # constructor + if isinstance(template, compat.string_types) and args: + return + self.template = template + self.id = template.module.__name__ + self.starttime = template.module._modified_time + self._def_regions = {} + self.impl = self._load_impl(self.template.cache_impl) + + def _load_impl(self, name): + return _cache_plugins.load(name)(self) + + def get_or_create(self, key, creation_function, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value.""" + + return self._ctx_get_or_create(key, creation_function, None, **kw) + + def _ctx_get_or_create(self, key, creation_function, context, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value.""" + + if not self.template.cache_enabled: + return creation_function() + + return self.impl.get_or_create( + key, + creation_function, + **self._get_cache_kw(kw, context)) + + def set(self, key, value, **kw): + """Place a value in the cache. + + :param key: the value's key. + :param value: the value. + :param \**kw: cache configuration arguments. + + """ + + self.impl.set(key, value, **self._get_cache_kw(kw, None)) + + put = set + """A synonym for :meth:`.Cache.set`. + + This is here for backwards compatibility. + + """ + + def get(self, key, **kw): + """Retrieve a value from the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. The + backend is configured using these arguments upon first request. + Subsequent requests that use the same series of configuration + values will use that same backend. + + """ + return self.impl.get(key, **self._get_cache_kw(kw, None)) + + def invalidate(self, key, **kw): + """Invalidate a value in the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. The + backend is configured using these arguments upon first request. + Subsequent requests that use the same series of configuration + values will use that same backend. + + """ + self.impl.invalidate(key, **self._get_cache_kw(kw, None)) + + def invalidate_body(self): + """Invalidate the cached content of the "body" method for this + template. + + """ + self.invalidate('render_body', __M_defname='render_body') + + def invalidate_def(self, name): + """Invalidate the cached content of a particular ``<%def>`` within this + template. + + """ + + self.invalidate('render_%s' % name, __M_defname='render_%s' % name) + + def invalidate_closure(self, name): + """Invalidate a nested ``<%def>`` within this template. + + Caching of nested defs is a blunt tool as there is no + management of scope -- nested defs that use cache tags + need to have names unique of all other nested defs in the + template, else their content will be overwritten by + each other. 
+ + """ + + self.invalidate(name, __M_defname=name) + + def _get_cache_kw(self, kw, context): + defname = kw.pop('__M_defname', None) + if not defname: + tmpl_kw = self.template.cache_args.copy() + tmpl_kw.update(kw) + elif defname in self._def_regions: + tmpl_kw = self._def_regions[defname] + else: + tmpl_kw = self.template.cache_args.copy() + tmpl_kw.update(kw) + self._def_regions[defname] = tmpl_kw + if context and self.impl.pass_context: + tmpl_kw = tmpl_kw.copy() + tmpl_kw.setdefault('context', context) + return tmpl_kw + + +class CacheImpl(object): + """Provide a cache implementation for use by :class:`.Cache`.""" + + def __init__(self, cache): + self.cache = cache + + pass_context = False + """If ``True``, the :class:`.Context` will be passed to + :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``. + """ + + def get_or_create(self, key, creation_function, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value. + + This function *must* return a value, either from + the cache, or via the given creation function. + If the creation function is called, the newly + created value should be populated into the cache + under the given key before being returned. + + :param key: the value's key. + :param creation_function: function that when called generates + a new value. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def set(self, key, value, **kw): + """Place a value in the cache. + + :param key: the value's key. + :param value: the value. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def get(self, key, **kw): + """Retrieve a value from the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def invalidate(self, key, **kw): + """Invalidate a value in the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py new file mode 100644 index 00000000000..1a9ca56637c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py @@ -0,0 +1,62 @@ +# mako/cmd.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php +from argparse import ArgumentParser +from os.path import isfile, dirname +import sys +from mako.template import Template +from mako.lookup import TemplateLookup +from mako import exceptions + +def varsplit(var): + if "=" not in var: + return (var, "") + return var.split("=", 1) + +def _exit(): + sys.stderr.write(exceptions.text_error_template().render()) + sys.exit(1) + +def cmdline(argv=None): + + parser = ArgumentParser("usage: %prog [FILENAME]") + parser.add_argument("--var", default=[], action="append", + help="variable (can be used multiple times, use name=value)") + parser.add_argument("--template-dir", default=[], action="append", + help="Directory to use for template lookup (multiple " + "directories may be provided). 
If not given then if the " + "template is read from stdin, the value defaults to be " + "the current directory, otherwise it defaults to be the " + "parent directory of the file provided.") + parser.add_argument('input', nargs='?', default='-') + + options = parser.parse_args(argv) + if options.input == '-': + lookup_dirs = options.template_dir or ["."] + lookup = TemplateLookup(lookup_dirs) + try: + template = Template(sys.stdin.read(), lookup=lookup) + except: + _exit() + else: + filename = options.input + if not isfile(filename): + raise SystemExit("error: can't find %s" % filename) + lookup_dirs = options.template_dir or [dirname(filename)] + lookup = TemplateLookup(lookup_dirs) + try: + template = Template(filename=filename, lookup=lookup) + except: + _exit() + + kw = dict([varsplit(var) for var in options.var]) + try: + print(template.render(**kw)) + except: + _exit() + + +if __name__ == "__main__": + cmdline() diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py new file mode 100644 index 00000000000..4b0bda86731 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py @@ -0,0 +1,1237 @@ +# mako/codegen.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""provides functionality for rendering a parsetree constructing into module +source code.""" + +import time +import re +from mako.pygen import PythonPrinter +from mako import util, ast, parsetree, filters, exceptions +from mako import compat + + +MAGIC_NUMBER = 10 + +# names which are hardwired into the +# template and are not accessed via the +# context itself +RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED']) + +def compile(node, + uri, + filename=None, + default_filters=None, + buffer_filters=None, + imports=None, + future_imports=None, + source_encoding=None, + generate_magic_comment=True, + disable_unicode=False, + strict_undefined=False, + enable_loop=True, + reserved_names=frozenset()): + + """Generate module source code given a parsetree node, + uri, and optional source filename""" + + # if on Py2K, push the "source_encoding" string to be + # a bytestring itself, as we will be embedding it into + # the generated source and we don't want to coerce the + # result into a unicode object, in "disable_unicode" mode + if not compat.py3k and isinstance(source_encoding, compat.text_type): + source_encoding = source_encoding.encode(source_encoding) + + + buf = util.FastEncodingBuffer() + + printer = PythonPrinter(buf) + _GenerateRenderMethod(printer, + _CompileContext(uri, + filename, + default_filters, + buffer_filters, + imports, + future_imports, + source_encoding, + generate_magic_comment, + disable_unicode, + strict_undefined, + enable_loop, + reserved_names), + node) + return buf.getvalue() + +class _CompileContext(object): + def __init__(self, + uri, + filename, + default_filters, + buffer_filters, + imports, + future_imports, + source_encoding, + generate_magic_comment, + disable_unicode, + strict_undefined, + enable_loop, + reserved_names): + self.uri = uri + self.filename = filename + self.default_filters = default_filters + self.buffer_filters = buffer_filters + self.imports = imports + self.future_imports = future_imports + self.source_encoding = source_encoding + self.generate_magic_comment = generate_magic_comment + self.disable_unicode = disable_unicode + 
self.strict_undefined = strict_undefined + self.enable_loop = enable_loop + self.reserved_names = reserved_names + +class _GenerateRenderMethod(object): + """A template visitor object which generates the + full module source for a template. + + """ + def __init__(self, printer, compiler, node): + self.printer = printer + self.compiler = compiler + self.node = node + self.identifier_stack = [None] + self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag)) + + if self.in_def: + name = "render_%s" % node.funcname + args = node.get_argument_expressions() + filtered = len(node.filter_args.args) > 0 + buffered = eval(node.attributes.get('buffered', 'False')) + cached = eval(node.attributes.get('cached', 'False')) + defs = None + pagetag = None + if node.is_block and not node.is_anonymous: + args += ['**pageargs'] + else: + defs = self.write_toplevel() + pagetag = self.compiler.pagetag + name = "render_body" + if pagetag is not None: + args = pagetag.body_decl.get_argument_expressions() + if not pagetag.body_decl.kwargs: + args += ['**pageargs'] + cached = eval(pagetag.attributes.get('cached', 'False')) + self.compiler.enable_loop = self.compiler.enable_loop or eval( + pagetag.attributes.get( + 'enable_loop', 'False') + ) + else: + args = ['**pageargs'] + cached = False + buffered = filtered = False + if args is None: + args = ['context'] + else: + args = [a for a in ['context'] + args] + + self.write_render_callable( + pagetag or node, + name, args, + buffered, filtered, cached) + + if defs is not None: + for node in defs: + _GenerateRenderMethod(printer, compiler, node) + + if not self.in_def: + self.write_metadata_struct() + + def write_metadata_struct(self): + self.printer.source_map[self.printer.lineno] = \ + max(self.printer.source_map) + struct = { + "filename": self.compiler.filename, + "uri": self.compiler.uri, + "source_encoding": self.compiler.source_encoding, + "line_map": self.printer.source_map, + } + self.printer.writelines( + '"""', + '__M_BEGIN_METADATA', + compat.json.dumps(struct), + '__M_END_METADATA\n' + '"""' + ) + + @property + def identifiers(self): + return self.identifier_stack[-1] + + def write_toplevel(self): + """Traverse a template structure for module-level directives and + generate the start of module-level code. 
+ + """ + inherit = [] + namespaces = {} + module_code = [] + + self.compiler.pagetag = None + + class FindTopLevel(object): + def visitInheritTag(s, node): + inherit.append(node) + def visitNamespaceTag(s, node): + namespaces[node.name] = node + def visitPageTag(s, node): + self.compiler.pagetag = node + def visitCode(s, node): + if node.ismodule: + module_code.append(node) + + f = FindTopLevel() + for n in self.node.nodes: + n.accept_visitor(f) + + self.compiler.namespaces = namespaces + + module_ident = set() + for n in module_code: + module_ident = module_ident.union(n.declared_identifiers()) + + module_identifiers = _Identifiers(self.compiler) + module_identifiers.declared = module_ident + + # module-level names, python code + if self.compiler.generate_magic_comment and \ + self.compiler.source_encoding: + self.printer.writeline("# -*- coding:%s -*-" % + self.compiler.source_encoding) + + if self.compiler.future_imports: + self.printer.writeline("from __future__ import %s" % + (", ".join(self.compiler.future_imports),)) + self.printer.writeline("from mako import runtime, filters, cache") + self.printer.writeline("UNDEFINED = runtime.UNDEFINED") + self.printer.writeline("__M_dict_builtin = dict") + self.printer.writeline("__M_locals_builtin = locals") + self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER) + self.printer.writeline("_modified_time = %r" % time.time()) + self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop) + self.printer.writeline( + "_template_filename = %r" % self.compiler.filename) + self.printer.writeline("_template_uri = %r" % self.compiler.uri) + self.printer.writeline( + "_source_encoding = %r" % self.compiler.source_encoding) + if self.compiler.imports: + buf = '' + for imp in self.compiler.imports: + buf += imp + "\n" + self.printer.writeline(imp) + impcode = ast.PythonCode( + buf, + source='', lineno=0, + pos=0, + filename='template defined imports') + else: + impcode = None + + main_identifiers = module_identifiers.branch(self.node) + module_identifiers.topleveldefs = \ + module_identifiers.topleveldefs.\ + union(main_identifiers.topleveldefs) + module_identifiers.declared.add("UNDEFINED") + if impcode: + module_identifiers.declared.update(impcode.declared_identifiers) + + self.compiler.identifiers = module_identifiers + self.printer.writeline("_exports = %r" % + [n.name for n in + main_identifiers.topleveldefs.values()] + ) + self.printer.write_blanks(2) + + if len(module_code): + self.write_module_code(module_code) + + if len(inherit): + self.write_namespaces(namespaces) + self.write_inherit(inherit[-1]) + elif len(namespaces): + self.write_namespaces(namespaces) + + return list(main_identifiers.topleveldefs.values()) + + def write_render_callable(self, node, name, args, buffered, filtered, + cached): + """write a top-level render callable. 
+ + this could be the main render() method or that of a top-level def.""" + + if self.in_def: + decorator = node.decorator + if decorator: + self.printer.writeline( + "@runtime._decorate_toplevel(%s)" % decorator) + + self.printer.start_source(node.lineno) + self.printer.writelines( + "def %s(%s):" % (name, ','.join(args)), + # push new frame, assign current frame to __M_caller + "__M_caller = context.caller_stack._push_frame()", + "try:" + ) + if buffered or filtered or cached: + self.printer.writeline("context._push_buffer()") + + self.identifier_stack.append( + self.compiler.identifiers.branch(self.node)) + if (not self.in_def or self.node.is_block) and '**pageargs' in args: + self.identifier_stack[-1].argument_declared.add('pageargs') + + if not self.in_def and ( + len(self.identifiers.locally_assigned) > 0 or + len(self.identifiers.argument_declared) > 0 + ): + self.printer.writeline("__M_locals = __M_dict_builtin(%s)" % + ','.join([ + "%s=%s" % (x, x) for x in + self.identifiers.argument_declared + ])) + + self.write_variable_declares(self.identifiers, toplevel=True) + + for n in self.node.nodes: + n.accept_visitor(self) + + self.write_def_finish(self.node, buffered, filtered, cached) + self.printer.writeline(None) + self.printer.write_blanks(2) + if cached: + self.write_cache_decorator( + node, name, + args, buffered, + self.identifiers, toplevel=True) + + def write_module_code(self, module_code): + """write module-level template code, i.e. that which + is enclosed in <%! %> tags in the template.""" + for n in module_code: + self.printer.start_source(n.lineno) + self.printer.write_indented_block(n.text) + + def write_inherit(self, node): + """write the module-level inheritance-determination callable.""" + + self.printer.writelines( + "def _mako_inherit(template, context):", + "_mako_generate_namespaces(context)", + "return runtime._inherit_from(context, %s, _template_uri)" % + (node.parsed_attributes['file']), + None + ) + + def write_namespaces(self, namespaces): + """write the module-level namespace-generating callable.""" + self.printer.writelines( + "def _mako_get_namespace(context, name):", + "try:", + "return context.namespaces[(__name__, name)]", + "except KeyError:", + "_mako_generate_namespaces(context)", + "return context.namespaces[(__name__, name)]", + None, None + ) + self.printer.writeline("def _mako_generate_namespaces(context):") + + + for node in namespaces.values(): + if 'import' in node.attributes: + self.compiler.has_ns_imports = True + self.printer.start_source(node.lineno) + if len(node.nodes): + self.printer.writeline("def make_namespace():") + export = [] + identifiers = self.compiler.identifiers.branch(node) + self.in_def = True + class NSDefVisitor(object): + def visitDefTag(s, node): + s.visitDefOrBase(node) + + def visitBlockTag(s, node): + s.visitDefOrBase(node) + + def visitDefOrBase(s, node): + if node.is_anonymous: + raise exceptions.CompileException( + "Can't put anonymous blocks inside " + "<%namespace>", + **node.exception_kwargs + ) + self.write_inline_def(node, identifiers, nested=False) + export.append(node.funcname) + vis = NSDefVisitor() + for n in node.nodes: + n.accept_visitor(vis) + self.printer.writeline("return [%s]" % (','.join(export))) + self.printer.writeline(None) + self.in_def = False + callable_name = "make_namespace()" + else: + callable_name = "None" + + if 'file' in node.parsed_attributes: + self.printer.writeline( + "ns = runtime.TemplateNamespace(%r," + " context._clean_inheritance_tokens()," + " templateuri=%s, callables=%s, 
" + " calling_uri=_template_uri)" % + ( + node.name, + node.parsed_attributes.get('file', 'None'), + callable_name, + ) + ) + elif 'module' in node.parsed_attributes: + self.printer.writeline( + "ns = runtime.ModuleNamespace(%r," + " context._clean_inheritance_tokens()," + " callables=%s, calling_uri=_template_uri," + " module=%s)" % + ( + node.name, + callable_name, + node.parsed_attributes.get( + 'module', 'None') + ) + ) + else: + self.printer.writeline( + "ns = runtime.Namespace(%r," + " context._clean_inheritance_tokens()," + " callables=%s, calling_uri=_template_uri)" % + ( + node.name, + callable_name, + ) + ) + if eval(node.attributes.get('inheritable', "False")): + self.printer.writeline("context['self'].%s = ns" % (node.name)) + + self.printer.writeline( + "context.namespaces[(__name__, %s)] = ns" % repr(node.name)) + self.printer.write_blanks(1) + if not len(namespaces): + self.printer.writeline("pass") + self.printer.writeline(None) + + def write_variable_declares(self, identifiers, toplevel=False, limit=None): + """write variable declarations at the top of a function. + + the variable declarations are in the form of callable + definitions for defs and/or name lookup within the + function's context argument. the names declared are based + on the names that are referenced in the function body, + which don't otherwise have any explicit assignment + operation. names that are assigned within the body are + assumed to be locally-scoped variables and are not + separately declared. + + for def callable definitions, if the def is a top-level + callable then a 'stub' callable is generated which wraps + the current Context into a closure. if the def is not + top-level, it is fully rendered as a local closure. + + """ + + # collection of all defs available to us in this scope + comp_idents = dict([(c.funcname, c) for c in identifiers.defs]) + to_write = set() + + # write "context.get()" for all variables we are going to + # need that arent in the namespace yet + to_write = to_write.union(identifiers.undeclared) + + # write closure functions for closures that we define + # right here + to_write = to_write.union( + [c.funcname for c in identifiers.closuredefs.values()]) + + # remove identifiers that are declared in the argument + # signature of the callable + to_write = to_write.difference(identifiers.argument_declared) + + # remove identifiers that we are going to assign to. + # in this way we mimic Python's behavior, + # i.e. assignment to a variable within a block + # means that variable is now a "locally declared" var, + # which cannot be referenced beforehand. + to_write = to_write.difference(identifiers.locally_declared) + + if self.compiler.enable_loop: + has_loop = "loop" in to_write + to_write.discard("loop") + else: + has_loop = False + + # if a limiting set was sent, constraint to those items in that list + # (this is used for the caching decorator) + if limit is not None: + to_write = to_write.intersection(limit) + + if toplevel and getattr(self.compiler, 'has_ns_imports', False): + self.printer.writeline("_import_ns = {}") + self.compiler.has_imports = True + for ident, ns in self.compiler.namespaces.items(): + if 'import' in ns.attributes: + self.printer.writeline( + "_mako_get_namespace(context, %r)." 
+ "_populate(_import_ns, %r)" % + ( + ident, + re.split(r'\s*,\s*', ns.attributes['import']) + )) + + if has_loop: + self.printer.writeline( + 'loop = __M_loop = runtime.LoopStack()' + ) + + for ident in to_write: + if ident in comp_idents: + comp = comp_idents[ident] + if comp.is_block: + if not comp.is_anonymous: + self.write_def_decl(comp, identifiers) + else: + self.write_inline_def(comp, identifiers, nested=True) + else: + if comp.is_root(): + self.write_def_decl(comp, identifiers) + else: + self.write_inline_def(comp, identifiers, nested=True) + + elif ident in self.compiler.namespaces: + self.printer.writeline( + "%s = _mako_get_namespace(context, %r)" % + (ident, ident) + ) + else: + if getattr(self.compiler, 'has_ns_imports', False): + if self.compiler.strict_undefined: + self.printer.writelines( + "%s = _import_ns.get(%r, UNDEFINED)" % + (ident, ident), + "if %s is UNDEFINED:" % ident, + "try:", + "%s = context[%r]" % (ident, ident), + "except KeyError:", + "raise NameError(\"'%s' is not defined\")" % + ident, + None, None + ) + else: + self.printer.writeline( + "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" % + (ident, ident, ident)) + else: + if self.compiler.strict_undefined: + self.printer.writelines( + "try:", + "%s = context[%r]" % (ident, ident), + "except KeyError:", + "raise NameError(\"'%s' is not defined\")" % + ident, + None + ) + else: + self.printer.writeline( + "%s = context.get(%r, UNDEFINED)" % (ident, ident) + ) + + self.printer.writeline("__M_writer = context.writer()") + + def write_def_decl(self, node, identifiers): + """write a locally-available callable referencing a top-level def""" + funcname = node.funcname + namedecls = node.get_argument_expressions() + nameargs = node.get_argument_expressions(as_call=True) + + if not self.in_def and ( + len(self.identifiers.locally_assigned) > 0 or + len(self.identifiers.argument_declared) > 0): + nameargs.insert(0, 'context._locals(__M_locals)') + else: + nameargs.insert(0, 'context') + self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls))) + self.printer.writeline( + "return render_%s(%s)" % (funcname, ",".join(nameargs))) + self.printer.writeline(None) + + def write_inline_def(self, node, identifiers, nested): + """write a locally-available def callable inside an enclosing def.""" + + namedecls = node.get_argument_expressions() + + decorator = node.decorator + if decorator: + self.printer.writeline( + "@runtime._decorate_inline(context, %s)" % decorator) + self.printer.writeline( + "def %s(%s):" % (node.funcname, ",".join(namedecls))) + filtered = len(node.filter_args.args) > 0 + buffered = eval(node.attributes.get('buffered', 'False')) + cached = eval(node.attributes.get('cached', 'False')) + self.printer.writelines( + # push new frame, assign current frame to __M_caller + "__M_caller = context.caller_stack._push_frame()", + "try:" + ) + if buffered or filtered or cached: + self.printer.writelines( + "context._push_buffer()", + ) + + identifiers = identifiers.branch(node, nested=nested) + + self.write_variable_declares(identifiers) + + self.identifier_stack.append(identifiers) + for n in node.nodes: + n.accept_visitor(self) + self.identifier_stack.pop() + + self.write_def_finish(node, buffered, filtered, cached) + self.printer.writeline(None) + if cached: + self.write_cache_decorator(node, node.funcname, + namedecls, False, identifiers, + inline=True, toplevel=False) + + def write_def_finish(self, node, buffered, filtered, cached, + callstack=True): + """write the end section of a rendering 
function, either outermost or + inline. + + this takes into account if the rendering function was filtered, + buffered, etc. and closes the corresponding try: block if any, and + writes code to retrieve captured content, apply filters, send proper + return value.""" + + if not buffered and not cached and not filtered: + self.printer.writeline("return ''") + if callstack: + self.printer.writelines( + "finally:", + "context.caller_stack._pop_frame()", + None + ) + + if buffered or filtered or cached: + if buffered or cached: + # in a caching scenario, don't try to get a writer + # from the context after popping; assume the caching + # implemenation might be using a context with no + # extra buffers + self.printer.writelines( + "finally:", + "__M_buf = context._pop_buffer()" + ) + else: + self.printer.writelines( + "finally:", + "__M_buf, __M_writer = context._pop_buffer_and_writer()" + ) + + if callstack: + self.printer.writeline("context.caller_stack._pop_frame()") + + s = "__M_buf.getvalue()" + if filtered: + s = self.create_filter_callable(node.filter_args.args, s, + False) + self.printer.writeline(None) + if buffered and not cached: + s = self.create_filter_callable(self.compiler.buffer_filters, + s, False) + if buffered or cached: + self.printer.writeline("return %s" % s) + else: + self.printer.writelines( + "__M_writer(%s)" % s, + "return ''" + ) + + def write_cache_decorator(self, node_or_pagetag, name, + args, buffered, identifiers, + inline=False, toplevel=False): + """write a post-function decorator to replace a rendering + callable with a cached version of itself.""" + + self.printer.writeline("__M_%s = %s" % (name, name)) + cachekey = node_or_pagetag.parsed_attributes.get('cache_key', + repr(name)) + + cache_args = {} + if self.compiler.pagetag is not None: + cache_args.update( + ( + pa[6:], + self.compiler.pagetag.parsed_attributes[pa] + ) + for pa in self.compiler.pagetag.parsed_attributes + if pa.startswith('cache_') and pa != 'cache_key' + ) + cache_args.update( + ( + pa[6:], + node_or_pagetag.parsed_attributes[pa] + ) for pa in node_or_pagetag.parsed_attributes + if pa.startswith('cache_') and pa != 'cache_key' + ) + if 'timeout' in cache_args: + cache_args['timeout'] = int(eval(cache_args['timeout'])) + + self.printer.writeline("def %s(%s):" % (name, ','.join(args))) + + # form "arg1, arg2, arg3=arg3, arg4=arg4", etc. + pass_args = [ + "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a + for a in args + ] + + self.write_variable_declares( + identifiers, + toplevel=toplevel, + limit=node_or_pagetag.undeclared_identifiers() + ) + if buffered: + s = "context.get('local')."\ + "cache._ctx_get_or_create("\ + "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % ( + cachekey, name, ','.join(pass_args), + ''.join(["%s=%s, " % (k, v) + for k, v in cache_args.items()]), + name + ) + # apply buffer_filters + s = self.create_filter_callable(self.compiler.buffer_filters, s, + False) + self.printer.writelines("return " + s, None) + else: + self.printer.writelines( + "__M_writer(context.get('local')." 
+ "cache._ctx_get_or_create(" + "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" % + ( + cachekey, name, ','.join(pass_args), + ''.join(["%s=%s, " % (k, v) + for k, v in cache_args.items()]), + name, + ), + "return ''", + None + ) + + def create_filter_callable(self, args, target, is_expression): + """write a filter-applying expression based on the filters + present in the given filter names, adjusting for the global + 'default' filter aliases as needed.""" + + def locate_encode(name): + if re.match(r'decode\..+', name): + return "filters." + name + elif self.compiler.disable_unicode: + return filters.NON_UNICODE_ESCAPES.get(name, name) + else: + return filters.DEFAULT_ESCAPES.get(name, name) + + if 'n' not in args: + if is_expression: + if self.compiler.pagetag: + args = self.compiler.pagetag.filter_args.args + args + if self.compiler.default_filters: + args = self.compiler.default_filters + args + for e in args: + # if filter given as a function, get just the identifier portion + if e == 'n': + continue + m = re.match(r'(.+?)(\(.*\))', e) + if m: + ident, fargs = m.group(1, 2) + f = locate_encode(ident) + e = f + fargs + else: + e = locate_encode(e) + assert e is not None + target = "%s(%s)" % (e, target) + return target + + def visitExpression(self, node): + self.printer.start_source(node.lineno) + if len(node.escapes) or \ + ( + self.compiler.pagetag is not None and + len(self.compiler.pagetag.filter_args.args) + ) or \ + len(self.compiler.default_filters): + + s = self.create_filter_callable(node.escapes_code.args, + "%s" % node.text, True) + self.printer.writeline("__M_writer(%s)" % s) + else: + self.printer.writeline("__M_writer(%s)" % node.text) + + def visitControlLine(self, node): + if node.isend: + self.printer.writeline(None) + if node.has_loop_context: + self.printer.writeline('finally:') + self.printer.writeline("loop = __M_loop._exit()") + self.printer.writeline(None) + else: + self.printer.start_source(node.lineno) + if self.compiler.enable_loop and node.keyword == 'for': + text = mangle_mako_loop(node, self.printer) + else: + text = node.text + self.printer.writeline(text) + children = node.get_children() + # this covers the three situations where we want to insert a pass: + # 1) a ternary control line with no children, + # 2) a primary control line with nothing but its own ternary + # and end control lines, and + # 3) any control line with no content other than comments + if not children or ( + compat.all(isinstance(c, (parsetree.Comment, + parsetree.ControlLine)) + for c in children) and + compat.all((node.is_ternary(c.keyword) or c.isend) + for c in children + if isinstance(c, parsetree.ControlLine))): + self.printer.writeline("pass") + + def visitText(self, node): + self.printer.start_source(node.lineno) + self.printer.writeline("__M_writer(%s)" % repr(node.content)) + + def visitTextTag(self, node): + filtered = len(node.filter_args.args) > 0 + if filtered: + self.printer.writelines( + "__M_writer = context._push_writer()", + "try:", + ) + for n in node.nodes: + n.accept_visitor(self) + if filtered: + self.printer.writelines( + "finally:", + "__M_buf, __M_writer = context._pop_buffer_and_writer()", + "__M_writer(%s)" % + self.create_filter_callable( + node.filter_args.args, + "__M_buf.getvalue()", + False), + None + ) + + def visitCode(self, node): + if not node.ismodule: + self.printer.start_source(node.lineno) + self.printer.write_indented_block(node.text) + + if not self.in_def and len(self.identifiers.locally_assigned) > 0: + # if we are the "template" def, 
fudge locally + # declared/modified variables into the "__M_locals" dictionary, + # which is used for def calls within the same template, + # to simulate "enclosing scope" + self.printer.writeline( + '__M_locals_builtin_stored = __M_locals_builtin()') + self.printer.writeline( + '__M_locals.update(__M_dict_builtin([(__M_key,' + ' __M_locals_builtin_stored[__M_key]) for __M_key in' + ' [%s] if __M_key in __M_locals_builtin_stored]))' % + ','.join([repr(x) for x in node.declared_identifiers()])) + + def visitIncludeTag(self, node): + self.printer.start_source(node.lineno) + args = node.attributes.get('args') + if args: + self.printer.writeline( + "runtime._include_file(context, %s, _template_uri, %s)" % + (node.parsed_attributes['file'], args)) + else: + self.printer.writeline( + "runtime._include_file(context, %s, _template_uri)" % + (node.parsed_attributes['file'])) + + def visitNamespaceTag(self, node): + pass + + def visitDefTag(self, node): + pass + + def visitBlockTag(self, node): + if node.is_anonymous: + self.printer.writeline("%s()" % node.funcname) + else: + nameargs = node.get_argument_expressions(as_call=True) + nameargs += ['**pageargs'] + self.printer.writeline("if 'parent' not in context._data or " + "not hasattr(context._data['parent'], '%s'):" + % node.funcname) + self.printer.writeline( + "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs))) + self.printer.writeline("\n") + + def visitCallNamespaceTag(self, node): + # TODO: we can put namespace-specific checks here, such + # as ensure the given namespace will be imported, + # pre-import the namespace, etc. + self.visitCallTag(node) + + def visitCallTag(self, node): + self.printer.writeline("def ccall(caller):") + export = ['body'] + callable_identifiers = self.identifiers.branch(node, nested=True) + body_identifiers = callable_identifiers.branch(node, nested=False) + # we want the 'caller' passed to ccall to be used + # for the body() function, but for other non-body() + # <%def>s within <%call> we want the current caller + # off the call stack (if any) + body_identifiers.add_declared('caller') + + self.identifier_stack.append(body_identifiers) + class DefVisitor(object): + def visitDefTag(s, node): + s.visitDefOrBase(node) + + def visitBlockTag(s, node): + s.visitDefOrBase(node) + + def visitDefOrBase(s, node): + self.write_inline_def(node, callable_identifiers, nested=False) + if not node.is_anonymous: + export.append(node.funcname) + # remove defs that are within the <%call> from the + # "closuredefs" defined in the body, so they dont render twice + if node.funcname in body_identifiers.closuredefs: + del body_identifiers.closuredefs[node.funcname] + + vis = DefVisitor() + for n in node.nodes: + n.accept_visitor(vis) + self.identifier_stack.pop() + + bodyargs = node.body_decl.get_argument_expressions() + self.printer.writeline("def body(%s):" % ','.join(bodyargs)) + + # TODO: figure out best way to specify + # buffering/nonbuffering (at call time would be better) + buffered = False + if buffered: + self.printer.writelines( + "context._push_buffer()", + "try:" + ) + self.write_variable_declares(body_identifiers) + self.identifier_stack.append(body_identifiers) + + for n in node.nodes: + n.accept_visitor(self) + self.identifier_stack.pop() + + self.write_def_finish(node, buffered, False, False, callstack=False) + self.printer.writelines( + None, + "return [%s]" % (','.join(export)), + None + ) + + self.printer.writelines( + # push on caller for nested call + "context.caller_stack.nextcaller = " + 
"runtime.Namespace('caller', context, " + "callables=ccall(__M_caller))", + "try:") + self.printer.start_source(node.lineno) + self.printer.writelines( + "__M_writer(%s)" % self.create_filter_callable( + [], node.expression, True), + "finally:", + "context.caller_stack.nextcaller = None", + None + ) + +class _Identifiers(object): + """tracks the status of identifier names as template code is rendered.""" + + def __init__(self, compiler, node=None, parent=None, nested=False): + if parent is not None: + # if we are the branch created in write_namespaces(), + # we don't share any context from the main body(). + if isinstance(node, parsetree.NamespaceTag): + self.declared = set() + self.topleveldefs = util.SetLikeDict() + else: + # things that have already been declared + # in an enclosing namespace (i.e. names we can just use) + self.declared = set(parent.declared).\ + union([c.name for c in parent.closuredefs.values()]).\ + union(parent.locally_declared).\ + union(parent.argument_declared) + + # if these identifiers correspond to a "nested" + # scope, it means whatever the parent identifiers + # had as undeclared will have been declared by that parent, + # and therefore we have them in our scope. + if nested: + self.declared = self.declared.union(parent.undeclared) + + # top level defs that are available + self.topleveldefs = util.SetLikeDict(**parent.topleveldefs) + else: + self.declared = set() + self.topleveldefs = util.SetLikeDict() + + self.compiler = compiler + + # things within this level that are referenced before they + # are declared (e.g. assigned to) + self.undeclared = set() + + # things that are declared locally. some of these things + # could be in the "undeclared" list as well if they are + # referenced before declared + self.locally_declared = set() + + # assignments made in explicit python blocks. + # these will be propagated to + # the context of local def calls. 
+ self.locally_assigned = set() + + # things that are declared in the argument + # signature of the def callable + self.argument_declared = set() + + # closure defs that are defined in this level + self.closuredefs = util.SetLikeDict() + + self.node = node + + if node is not None: + node.accept_visitor(self) + + illegal_names = self.compiler.reserved_names.intersection( + self.locally_declared) + if illegal_names: + raise exceptions.NameConflictError( + "Reserved words declared in template: %s" % + ", ".join(illegal_names)) + + + def branch(self, node, **kwargs): + """create a new Identifiers for a new Node, with + this Identifiers as the parent.""" + + return _Identifiers(self.compiler, node, self, **kwargs) + + @property + def defs(self): + return set(self.topleveldefs.union(self.closuredefs).values()) + + def __repr__(self): + return "Identifiers(declared=%r, locally_declared=%r, "\ + "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\ + "argumentdeclared=%r)" %\ + ( + list(self.declared), + list(self.locally_declared), + list(self.undeclared), + [c.name for c in self.topleveldefs.values()], + [c.name for c in self.closuredefs.values()], + self.argument_declared) + + def check_declared(self, node): + """update the state of this Identifiers with the undeclared + and declared identifiers of the given node.""" + + for ident in node.undeclared_identifiers(): + if ident != 'context' and\ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + for ident in node.declared_identifiers(): + self.locally_declared.add(ident) + + def add_declared(self, ident): + self.declared.add(ident) + if ident in self.undeclared: + self.undeclared.remove(ident) + + def visitExpression(self, node): + self.check_declared(node) + + def visitControlLine(self, node): + self.check_declared(node) + + def visitCode(self, node): + if not node.ismodule: + self.check_declared(node) + self.locally_assigned = self.locally_assigned.union( + node.declared_identifiers()) + + def visitNamespaceTag(self, node): + # only traverse into the sub-elements of a + # <%namespace> tag if we are the branch created in + # write_namespaces() + if self.node is node: + for n in node.nodes: + n.accept_visitor(self) + + def _check_name_exists(self, collection, node): + existing = collection.get(node.funcname) + collection[node.funcname] = node + if existing is not None and \ + existing is not node and \ + (node.is_block or existing.is_block): + raise exceptions.CompileException( + "%%def or %%block named '%s' already " + "exists in this template." 
% + node.funcname, **node.exception_kwargs) + + def visitDefTag(self, node): + if node.is_root() and not node.is_anonymous: + self._check_name_exists(self.topleveldefs, node) + elif node is not self.node: + self._check_name_exists(self.closuredefs, node) + + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + # visit defs only one level deep + if node is self.node: + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + + for n in node.nodes: + n.accept_visitor(self) + + def visitBlockTag(self, node): + if node is not self.node and not node.is_anonymous: + + if isinstance(self.node, parsetree.DefTag): + raise exceptions.CompileException( + "Named block '%s' not allowed inside of def '%s'" + % (node.name, self.node.name), **node.exception_kwargs) + elif isinstance(self.node, + (parsetree.CallTag, parsetree.CallNamespaceTag)): + raise exceptions.CompileException( + "Named block '%s' not allowed inside of <%%call> tag" + % (node.name, ), **node.exception_kwargs) + + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + if not node.is_anonymous: + self._check_name_exists(self.topleveldefs, node) + self.undeclared.add(node.funcname) + elif node is not self.node: + self._check_name_exists(self.closuredefs, node) + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + for n in node.nodes: + n.accept_visitor(self) + + def visitTextTag(self, node): + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + def visitIncludeTag(self, node): + self.check_declared(node) + + def visitPageTag(self, node): + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + self.check_declared(node) + + def visitCallNamespaceTag(self, node): + self.visitCallTag(node) + + def visitCallTag(self, node): + if node is self.node: + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union( + self.locally_declared): + self.undeclared.add(ident) + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + for n in node.nodes: + n.accept_visitor(self) + else: + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union( + self.locally_declared): + self.undeclared.add(ident) + + +_FOR_LOOP = re.compile( + r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*' + r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):' +) + +def mangle_mako_loop(node, printer): + """converts a for loop into a context manager wrapped around a for loop + when access to the `loop` variable has been detected in the for loop body + """ + loop_variable = LoopVariable() + node.accept_visitor(loop_variable) + if loop_variable.detected: + node.nodes[-1].has_loop_context = True + match = _FOR_LOOP.match(node.text) + if match: + printer.writelines( + 'loop = __M_loop._enter(%s)' % match.group(2), + 'try:' + #'with __M_loop(%s) as loop:' % match.group(2) + ) + text = 'for %s in loop:' % match.group(1) + else: + raise SyntaxError("Couldn't apply loop context: %s" % node.text) + else: + text = node.text + return text + + +class LoopVariable(object): + """A node visitor which looks for the name 'loop' within undeclared + 
identifiers.""" + + def __init__(self): + self.detected = False + + def _loop_reference_detected(self, node): + if 'loop' in node.undeclared_identifiers(): + self.detected = True + else: + for n in node.get_children(): + n.accept_visitor(self) + + def visitControlLine(self, node): + self._loop_reference_detected(node) + + def visitCode(self, node): + self._loop_reference_detected(node) + + def visitExpression(self, node): + self._loop_reference_detected(node) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py new file mode 100644 index 00000000000..fe277bbf05a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py @@ -0,0 +1,174 @@ +import sys +import time + +py3k = sys.version_info >= (3, 0) +py33 = sys.version_info >= (3, 3) +py2k = sys.version_info < (3,) +py26 = sys.version_info >= (2, 6) +jython = sys.platform.startswith('java') +win32 = sys.platform.startswith('win') +pypy = hasattr(sys, 'pypy_version_info') + +if py3k: + from io import StringIO + import builtins as compat_builtins + from urllib.parse import quote_plus, unquote_plus + from html.entities import codepoint2name, name2codepoint + string_types = str, + binary_type = bytes + text_type = str + + from io import BytesIO as byte_buffer + + def u(s): + return s + + def b(s): + return s.encode("latin-1") + + def octal(lit): + return eval("0o" + lit) + +else: + import __builtin__ as compat_builtins + try: + from cStringIO import StringIO + except: + from StringIO import StringIO + + byte_buffer = StringIO + + from urllib import quote_plus, unquote_plus + from htmlentitydefs import codepoint2name, name2codepoint + string_types = basestring, + binary_type = str + text_type = unicode + + def u(s): + return unicode(s, "utf-8") + + def b(s): + return s + + def octal(lit): + return eval("0" + lit) + + +if py33: + from importlib import machinery + def load_module(module_id, path): + return machinery.SourceFileLoader(module_id, path).load_module() +else: + import imp + def load_module(module_id, path): + fp = open(path, 'rb') + try: + return imp.load_source(module_id, path, fp) + finally: + fp.close() + + +if py3k: + def reraise(tp, value, tb=None, cause=None): + if cause is not None: + value.__cause__ = cause + if value.__traceback__ is not tb: + raise value.with_traceback(tb) + raise value +else: + exec("def reraise(tp, value, tb=None, cause=None):\n" + " raise tp, value, tb\n") + + +def exception_as(): + return sys.exc_info()[1] + +try: + import threading + if py3k: + import _thread as thread + else: + import thread +except ImportError: + import dummy_threading as threading + if py3k: + import _dummy_thread as thread + else: + import dummy_thread as thread + +if win32 or jython: + time_func = time.clock +else: + time_func = time.time + +try: + from functools import partial +except: + def partial(func, *args, **keywords): + def newfunc(*fargs, **fkeywords): + newkeywords = keywords.copy() + newkeywords.update(fkeywords) + return func(*(args + fargs), **newkeywords) + return newfunc + + +all = all +import json + +def exception_name(exc): + return exc.__class__.__name__ + +try: + from inspect import CO_VARKEYWORDS, CO_VARARGS + def inspect_func_args(fn): + if py3k: + co = fn.__code__ + else: + co = fn.func_code + + nargs = co.co_argcount + names = co.co_varnames + args = list(names[:nargs]) + + varargs = None + if co.co_flags & CO_VARARGS: + varargs = co.co_varnames[nargs] + nargs = nargs + 1 + varkw = None + if co.co_flags & 
CO_VARKEYWORDS: + varkw = co.co_varnames[nargs] + + if py3k: + return args, varargs, varkw, fn.__defaults__ + else: + return args, varargs, varkw, fn.func_defaults +except ImportError: + import inspect + def inspect_func_args(fn): + return inspect.getargspec(fn) + +if py3k: + def callable(fn): + return hasattr(fn, '__call__') +else: + callable = callable + + +################################################ +# cross-compatible metaclass implementation +# Copyright (c) 2010-2012 Benjamin Peterson +def with_metaclass(meta, base=object): + """Create a base class with a metaclass.""" + return meta("%sBase" % meta.__name__, (base,), {}) +################################################ + + +def arg_stringname(func_arg): + """Gets the string name of a kwarg or vararg + In Python3.4 a function's args are + of _ast.arg type not _ast.name + """ + if hasattr(func_arg, 'arg'): + return func_arg.arg + else: + return str(func_arg) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py new file mode 100644 index 00000000000..c531f2118d0 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py @@ -0,0 +1,373 @@ +# mako/exceptions.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""exception classes""" + +import traceback +import sys +from mako import util, compat + +class MakoException(Exception): + pass + +class RuntimeException(MakoException): + pass + +def _format_filepos(lineno, pos, filename): + if filename is None: + return " at line: %d char: %d" % (lineno, pos) + else: + return " in file '%s' at line: %d char: %d" % (filename, lineno, pos) + + +class CompileException(MakoException): + def __init__(self, message, source, lineno, pos, filename): + MakoException.__init__(self, + message + _format_filepos(lineno, pos, filename)) + self.lineno = lineno + self.pos = pos + self.filename = filename + self.source = source + +class SyntaxException(MakoException): + def __init__(self, message, source, lineno, pos, filename): + MakoException.__init__(self, + message + _format_filepos(lineno, pos, filename)) + self.lineno = lineno + self.pos = pos + self.filename = filename + self.source = source + +class UnsupportedError(MakoException): + """raised when a retired feature is used.""" + +class NameConflictError(MakoException): + """raised when a reserved word is used inappropriately""" + +class TemplateLookupException(MakoException): + pass + +class TopLevelLookupException(TemplateLookupException): + pass + +class RichTraceback(object): + """Pull the current exception from the ``sys`` traceback and extracts + Mako-specific template information. + + See the usage examples in :ref:`handling_exceptions`. 
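+
+    A minimal usage sketch (an editorial illustration, not part of the
+    upstream docstring; ``my_template`` is a hypothetical
+    :class:`.Template`)::
+
+        from mako import exceptions
+        try:
+            my_template.render()
+        except:
+            tb = exceptions.RichTraceback()
+            for (filename, lineno, function, line) in tb.traceback:
+                print("File %s, line %s, in %s" % (filename, lineno, function))
+                print(line)
+            print("%s: %s" % (tb.error.__class__.__name__, tb.error))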
+ + """ + def __init__(self, error=None, traceback=None): + self.source, self.lineno = "", 0 + + if error is None or traceback is None: + t, value, tback = sys.exc_info() + + if error is None: + error = value or t + + if traceback is None: + traceback = tback + + self.error = error + self.records = self._init(traceback) + + if isinstance(self.error, (CompileException, SyntaxException)): + self.source = self.error.source + self.lineno = self.error.lineno + self._has_source = True + + self._init_message() + + @property + def errorname(self): + return compat.exception_name(self.error) + + def _init_message(self): + """Find a unicode representation of self.error""" + try: + self.message = compat.text_type(self.error) + except UnicodeError: + try: + self.message = str(self.error) + except UnicodeEncodeError: + # Fallback to args as neither unicode nor + # str(Exception(u'\xe6')) work in Python < 2.6 + self.message = self.error.args[0] + if not isinstance(self.message, compat.text_type): + self.message = compat.text_type(self.message, 'ascii', 'replace') + + def _get_reformatted_records(self, records): + for rec in records: + if rec[6] is not None: + yield (rec[4], rec[5], rec[2], rec[6]) + else: + yield tuple(rec[0:4]) + + @property + def traceback(self): + """Return a list of 4-tuple traceback records (i.e. normal python + format) with template-corresponding lines remapped to the originating + template. + + """ + return list(self._get_reformatted_records(self.records)) + + @property + def reverse_records(self): + return reversed(self.records) + + @property + def reverse_traceback(self): + """Return the same data as traceback, except in reverse order. + """ + + return list(self._get_reformatted_records(self.reverse_records)) + + def _init(self, trcback): + """format a traceback from sys.exc_info() into 7-item tuples, + containing the regular four traceback tuple items, plus the original + template filename, the line number adjusted relative to the template + source, and code line from that line number of the template.""" + + import mako.template + mods = {} + rawrecords = traceback.extract_tb(trcback) + new_trcback = [] + for filename, lineno, function, line in rawrecords: + if not line: + line = '' + try: + (line_map, template_lines) = mods[filename] + except KeyError: + try: + info = mako.template._get_module_info(filename) + module_source = info.code + template_source = info.source + template_filename = info.template_filename or filename + except KeyError: + # A normal .py file (not a Template) + if not compat.py3k: + try: + fp = open(filename, 'rb') + encoding = util.parse_encoding(fp) + fp.close() + except IOError: + encoding = None + if encoding: + line = line.decode(encoding) + else: + line = line.decode('ascii', 'replace') + new_trcback.append((filename, lineno, function, line, + None, None, None, None)) + continue + + template_ln = 1 + + source_map = mako.template.ModuleInfo.\ + get_module_source_metadata( + module_source, full_line_map=True) + line_map = source_map['full_line_map'] + + template_lines = [line for line in + template_source.split("\n")] + mods[filename] = (line_map, template_lines) + + template_ln = line_map[lineno - 1] + + if template_ln <= len(template_lines): + template_line = template_lines[template_ln - 1] + else: + template_line = None + new_trcback.append((filename, lineno, function, + line, template_filename, template_ln, + template_line, template_source)) + if not self.source: + for l in range(len(new_trcback) - 1, 0, -1): + if new_trcback[l][5]: + self.source = 
new_trcback[l][7] + self.lineno = new_trcback[l][5] + break + else: + if new_trcback: + try: + # A normal .py file (not a Template) + fp = open(new_trcback[-1][0], 'rb') + encoding = util.parse_encoding(fp) + fp.seek(0) + self.source = fp.read() + fp.close() + if encoding: + self.source = self.source.decode(encoding) + except IOError: + self.source = '' + self.lineno = new_trcback[-1][1] + return new_trcback + + +def text_error_template(lookup=None): + """Provides a template that renders a stack trace in a similar format to + the Python interpreter, substituting source template filenames, line + numbers and code for that of the originating source template, as + applicable. + + """ + import mako.template + return mako.template.Template(r""" +<%page args="error=None, traceback=None"/> +<%! + from mako.exceptions import RichTraceback +%>\ +<% + tback = RichTraceback(error=error, traceback=traceback) +%>\ +Traceback (most recent call last): +% for (filename, lineno, function, line) in tback.traceback: + File "${filename}", line ${lineno}, in ${function or '?'} + ${line | trim} +% endfor +${tback.errorname}: ${tback.message} +""") + + +def _install_pygments(): + global syntax_highlight, pygments_html_formatter + from mako.ext.pygmentplugin import syntax_highlight,\ + pygments_html_formatter + +def _install_fallback(): + global syntax_highlight, pygments_html_formatter + from mako.filters import html_escape + pygments_html_formatter = None + def syntax_highlight(filename='', language=None): + return html_escape + +def _install_highlighting(): + try: + _install_pygments() + except ImportError: + _install_fallback() +_install_highlighting() + +def html_error_template(): + """Provides a template that renders a stack trace in an HTML format, + providing an excerpt of code as well as substituting source template + filenames, line numbers and code for that of the originating source + template, as applicable. + + The template's default ``encoding_errors`` value is + ``'htmlentityreplace'``. The template has two options. With the + ``full`` option disabled, only a section of an HTML document is + returned. With the ``css`` option disabled, the default stylesheet + won't be included. + + """ + import mako.template + return mako.template.Template(r""" +<%! 
+    from mako.exceptions import RichTraceback, syntax_highlight,\
+        pygments_html_formatter
+%>
+<%page args="full=True, css=True, error=None, traceback=None"/>
+% if full:
+<html>
+<head>
+    <title>Mako Runtime Error</title>
+% endif
+% if css:
+    <style>
+        body { font-family:verdana; margin:10px 30px 10px 30px;}
+        .stacktrace { margin:5px 5px 5px 5px; }
+        .highlight { padding:0px 10px 0px 10px; background-color:#9F9FDF; }
+        .nonhighlight { padding:0px; background-color:#DFDFDF; }
+        .sample { padding:10px; margin:10px 10px 10px 10px;
+                  font-family:monospace; }
+        .sampleline { padding:0px 10px 0px 10px; }
+        .sourceline { margin:5px 5px 10px 5px; font-family:monospace;}
+        .location { font-size:80%; }
+        .highlight { white-space:pre; }
+        .sampleline { white-space:pre; }
+
+    % if pygments_html_formatter:
+        ${pygments_html_formatter.get_style_defs()}
+        .linenos { min-width: 2.5em; text-align: right; }
+        pre { margin: 0; }
+        .syntax-highlighted { padding: 0 10px; }
+        .syntax-highlightedtable { border-spacing: 1px; }
+        .nonhighlight { border-top: 1px solid #DFDFDF;
+                        border-bottom: 1px solid #DFDFDF; }
+        .stacktrace .nonhighlight { margin: 5px 15px 10px; }
+        .sourceline { margin: 0 0; font-family:monospace; }
+        .code { background-color: #F8F8F8; width: 100%; }
+        .error .code { background-color: #FFBDBD; }
+        .error .syntax-highlighted { background-color: #FFBDBD; }
+    % endif
+
+    </style>
+% endif
+% if full:
+</head>
+<body>
+% endif
+
+<h2>Error !</h2>
+<%
+    tback = RichTraceback(error=error, traceback=traceback)
+    src = tback.source
+    line = tback.lineno
+    if src:
+        lines = src.split('\n')
+    else:
+        lines = None
+%>
+<h3>${tback.errorname}: ${tback.message|h}</h3>
+
+% if lines:
+    <div class="sample">
+    <div class="nonhighlight">
+% for index in range(max(0, line-4),min(len(lines), line+5)):
+    <%
+       if pygments_html_formatter:
+           pygments_html_formatter.linenostart = index + 1
+    %>
+    % if index + 1 == line:
+    <%
+       if pygments_html_formatter:
+           old_cssclass = pygments_html_formatter.cssclass
+           pygments_html_formatter.cssclass = 'error ' + old_cssclass
+    %>
+        ${lines[index] | syntax_highlight(language='mako')}
+    <%
+       if pygments_html_formatter:
+           pygments_html_formatter.cssclass = old_cssclass
+    %>
+    % else:
+        ${lines[index] | syntax_highlight(language='mako')}
+    % endif
+% endfor
+    </div>
+    </div>
+% endif
+
+<div class="stacktrace">
+% for (filename, lineno, function, line) in tback.reverse_traceback:
+    <div class="location">${filename}, line ${lineno}:</div>
+    <div class="nonhighlight">
+    <%
+        if pygments_html_formatter:
+            pygments_html_formatter.linenostart = lineno
+    %>
+      <div class="sourceline">${line | syntax_highlight(filename)}</div>
+    </div>
+% endfor
+</div>
+
+% if full:
+</body>
+</html>
+% endif
+""", output_encoding=sys.getdefaultencoding(),
+     encoding_errors='htmlentityreplace')
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
new file mode 100644
index 00000000000..d79ce2388f6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
@@ -0,0 +1,201 @@
+# mako/filters.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+
+import re
+import codecs
+
+from mako.compat import quote_plus, unquote_plus, codepoint2name, \
+    name2codepoint
+
+from mako import compat
+
+xml_escapes = {
+    '&': '&amp;',
+    '>': '&gt;',
+    '<': '&lt;',
+    '"': '&#34;',   # also &quot; in html-only
+    "'": '&#39;'    # also &apos; in html-only
+}
+
+# XXX: &#34; is valid in HTML and XML
+#      &#39; is not valid HTML, but is valid XML
+
+def legacy_html_escape(s):
+    """legacy HTML escape for non-unicode mode."""
+    s = s.replace("&", "&amp;")
+    s = s.replace(">", "&gt;")
+    s = s.replace("<", "&lt;")
+    s = s.replace('"', "&quot;")
+    s = s.replace("'", "&#39;")
+    return s
+
+
+try:
+    import markupsafe
+    html_escape = markupsafe.escape
+except ImportError:
+    html_escape = legacy_html_escape
+
+def xml_escape(string):
+    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
+
+def url_escape(string):
+    # convert into a list of octets
+    string = string.encode("utf8")
+    return quote_plus(string)
+
+def legacy_url_escape(string):
+    # convert into a list of octets
+    return quote_plus(string)
+
+def url_unescape(string):
+    text = unquote_plus(string)
+    if not is_ascii_str(text):
+        text = text.decode("utf8")
+    return text
+
+def trim(string):
+    return string.strip()
+
+
+class Decode(object):
+    def __getattr__(self, key):
+        def decode(x):
+            if isinstance(x, compat.text_type):
+                return x
+            elif not isinstance(x, compat.binary_type):
+                return decode(str(x))
+            else:
+                return compat.text_type(x, encoding=key)
+        return decode
+decode = Decode()
+
+
+_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
+
+def is_ascii_str(text):
+    return isinstance(text, str) and _ASCII_re.match(text)
+
+################################################################
+
+class XMLEntityEscaper(object):
+    def __init__(self, codepoint2name, name2codepoint):
+        self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
+                                      for c, n in codepoint2name.items()])
+        self.name2codepoint = name2codepoint
+
+    def escape_entities(self, text):
+        """Replace characters with their character entity references.
+
+        Only characters corresponding to a named entity are replaced.
+        """
+        return compat.text_type(text).translate(self.codepoint2entity)
+
+    def __escape(self, m):
+        codepoint = ord(m.group())
+        try:
+            return self.codepoint2entity[codepoint]
+        except (KeyError, IndexError):
+            return '&#x%X;' % codepoint
+
+
+    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
+
+    def escape(self, text):
+        """Replace characters with their character references.
+
+        Replace characters by their named entity references.
+        Non-ASCII characters, if they do not have a named entity reference,
+        are replaced by numerical character references.
+
+        The return value is guaranteed to be ASCII.
+        """
+        return self.__escapable.sub(self.__escape, compat.text_type(text)
+                                    ).encode('ascii')
+
+    # XXX: This regexp will not match all valid XML entity names__.
+    # (It punts on details involving involving CombiningChars and Extenders.)
+    #
+    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
+    __characterrefs = re.compile(r'''& (?:
+                                       \#(\d+)
+                                       | \#x([\da-f]+)
+                                       | ( (?!\d) [:\w] [-.:\w]+ )
+                                       ) ;''',
+                                 re.X | re.UNICODE)
+
+    def __unescape(self, m):
+        dval, hval, name = m.groups()
+        if dval:
+            codepoint = int(dval)
+        elif hval:
+            codepoint = int(hval, 16)
+        else:
+            codepoint = self.name2codepoint.get(name, 0xfffd)
+            # U+FFFD = "REPLACEMENT CHARACTER"
+        if codepoint < 128:
+            return chr(codepoint)
+        return chr(codepoint)
+
+    def unescape(self, text):
+        """Unescape character references.
+
+        All character references (both entity references and numerical
+        character references) are unescaped.
+ """ + return self.__characterrefs.sub(self.__unescape, text) + + +_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) + +html_entities_escape = _html_entities_escaper.escape_entities +html_entities_unescape = _html_entities_escaper.unescape + + +def htmlentityreplace_errors(ex): + """An encoding error handler. + + This python `codecs`_ error handler replaces unencodable + characters with HTML entities, or, if no HTML entity exists for + the character, XML character references. + + >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') + 'The cost was €12.' + """ + if isinstance(ex, UnicodeEncodeError): + # Handle encoding errors + bad_text = ex.object[ex.start:ex.end] + text = _html_entities_escaper.escape(bad_text) + return (compat.text_type(text), ex.end) + raise ex + +codecs.register_error('htmlentityreplace', htmlentityreplace_errors) + + +# TODO: options to make this dynamic per-compilation will be added in a later +# release +DEFAULT_ESCAPES = { + 'x': 'filters.xml_escape', + 'h': 'filters.html_escape', + 'u': 'filters.url_escape', + 'trim': 'filters.trim', + 'entity': 'filters.html_entities_escape', + 'unicode': 'unicode', + 'decode': 'decode', + 'str': 'str', + 'n': 'n' +} + +if compat.py3k: + DEFAULT_ESCAPES.update({ + 'unicode': 'str' + }) + +NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() +NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape' +NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape' + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py new file mode 100644 index 00000000000..1dda398215d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py @@ -0,0 +1,441 @@ +# mako/lexer.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""provides the Lexer class for parsing template strings into parse trees.""" + +import re +import codecs +from mako import parsetree, exceptions, compat +from mako.pygen import adjust_whitespace + +_regexp_cache = {} + +class Lexer(object): + def __init__(self, text, filename=None, + disable_unicode=False, + input_encoding=None, preprocessor=None): + self.text = text + self.filename = filename + self.template = parsetree.TemplateNode(self.filename) + self.matched_lineno = 1 + self.matched_charpos = 0 + self.lineno = 1 + self.match_position = 0 + self.tag = [] + self.control_line = [] + self.ternary_stack = [] + self.disable_unicode = disable_unicode + self.encoding = input_encoding + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + + if preprocessor is None: + self.preprocessor = [] + elif not hasattr(preprocessor, '__iter__'): + self.preprocessor = [preprocessor] + else: + self.preprocessor = preprocessor + + @property + def exception_kwargs(self): + return {'source': self.text, + 'lineno': self.matched_lineno, + 'pos': self.matched_charpos, + 'filename': self.filename} + + def match(self, regexp, flags=None): + """compile the given regexp, cache the reg, and call match_reg().""" + + try: + reg = _regexp_cache[(regexp, flags)] + except KeyError: + if flags: + reg = re.compile(regexp, flags) + else: + reg = re.compile(regexp) + _regexp_cache[(regexp, flags)] = reg + + return self.match_reg(reg) + + def match_reg(self, reg): + """match the given regular expression 
object to the current text + position. + + if a match occurs, update the current text and line position. + + """ + + mp = self.match_position + + match = reg.match(self.text, self.match_position) + if match: + (start, end) = match.span() + if end == start: + self.match_position = end + 1 + else: + self.match_position = end + self.matched_lineno = self.lineno + lines = re.findall(r"\n", self.text[mp:self.match_position]) + cp = mp - 1 + while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'): + cp -= 1 + self.matched_charpos = mp - cp + self.lineno += len(lines) + #print "MATCHED:", match.group(0), "LINE START:", + # self.matched_lineno, "LINE END:", self.lineno + #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ + # (match and "TRUE" or "FALSE") + return match + + def parse_until_text(self, *text): + startpos = self.match_position + text_re = r'|'.join(text) + brace_level = 0 + while True: + match = self.match(r'#.*\n') + if match: + continue + match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1', + re.S) + if match: + continue + match = self.match(r'(%s)' % text_re) + if match: + if match.group(1) == '}' and brace_level > 0: + brace_level -= 1 + continue + return \ + self.text[startpos: + self.match_position - len(match.group(1))],\ + match.group(1) + match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S) + if match: + brace_level += match.group(1).count('{') + brace_level -= match.group(1).count('}') + continue + raise exceptions.SyntaxException( + "Expected: %s" % + ','.join(text), + **self.exception_kwargs) + + def append_node(self, nodecls, *args, **kwargs): + kwargs.setdefault('source', self.text) + kwargs.setdefault('lineno', self.matched_lineno) + kwargs.setdefault('pos', self.matched_charpos) + kwargs['filename'] = self.filename + node = nodecls(*args, **kwargs) + if len(self.tag): + self.tag[-1].nodes.append(node) + else: + self.template.nodes.append(node) + # build a set of child nodes for the control line + # (used for loop variable detection) + # also build a set of child nodes on ternary control lines + # (used for determining if a pass needs to be auto-inserted + if self.control_line: + control_frame = self.control_line[-1] + control_frame.nodes.append(node) + if not (isinstance(node, parsetree.ControlLine) and + control_frame.is_ternary(node.keyword)): + if self.ternary_stack and self.ternary_stack[-1]: + self.ternary_stack[-1][-1].nodes.append(node) + if isinstance(node, parsetree.Tag): + if len(self.tag): + node.parent = self.tag[-1] + self.tag.append(node) + elif isinstance(node, parsetree.ControlLine): + if node.isend: + self.control_line.pop() + self.ternary_stack.pop() + elif node.is_primary: + self.control_line.append(node) + self.ternary_stack.append([]) + elif self.control_line and \ + self.control_line[-1].is_ternary(node.keyword): + self.ternary_stack[-1].append(node) + elif self.control_line and \ + not self.control_line[-1].is_ternary(node.keyword): + raise exceptions.SyntaxException( + "Keyword '%s' not a legal ternary for keyword '%s'" % + (node.keyword, self.control_line[-1].keyword), + **self.exception_kwargs) + + _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n') + + def decode_raw_stream(self, text, decode_raw, known_encoding, filename): + """given string/unicode or bytes/string, determine encoding + from magic encoding comment, return body as unicode + or raw if decode_raw=False + + """ + if isinstance(text, compat.text_type): + m = self._coding_re.match(text) + encoding = m and m.group(1) or known_encoding or 
'ascii' + return encoding, text + + if text.startswith(codecs.BOM_UTF8): + text = text[len(codecs.BOM_UTF8):] + parsed_encoding = 'utf-8' + m = self._coding_re.match(text.decode('utf-8', 'ignore')) + if m is not None and m.group(1) != 'utf-8': + raise exceptions.CompileException( + "Found utf-8 BOM in file, with conflicting " + "magic encoding comment of '%s'" % m.group(1), + text.decode('utf-8', 'ignore'), + 0, 0, filename) + else: + m = self._coding_re.match(text.decode('utf-8', 'ignore')) + if m: + parsed_encoding = m.group(1) + else: + parsed_encoding = known_encoding or 'ascii' + + if decode_raw: + try: + text = text.decode(parsed_encoding) + except UnicodeDecodeError: + raise exceptions.CompileException( + "Unicode decode operation of encoding '%s' failed" % + parsed_encoding, + text.decode('utf-8', 'ignore'), + 0, 0, filename) + + return parsed_encoding, text + + def parse(self): + self.encoding, self.text = self.decode_raw_stream(self.text, + not self.disable_unicode, + self.encoding, + self.filename,) + + for preproc in self.preprocessor: + self.text = preproc(self.text) + + # push the match marker past the + # encoding comment. + self.match_reg(self._coding_re) + + self.textlength = len(self.text) + + while (True): + if self.match_position > self.textlength: + break + + if self.match_end(): + break + if self.match_expression(): + continue + if self.match_control_line(): + continue + if self.match_comment(): + continue + if self.match_tag_start(): + continue + if self.match_tag_end(): + continue + if self.match_python_block(): + continue + if self.match_text(): + continue + + if self.match_position > self.textlength: + break + raise exceptions.CompileException("assertion failed") + + if len(self.tag): + raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % + self.tag[-1].keyword, + **self.exception_kwargs) + if len(self.control_line): + raise exceptions.SyntaxException( + "Unterminated control keyword: '%s'" % + self.control_line[-1].keyword, + self.text, + self.control_line[-1].lineno, + self.control_line[-1].pos, self.filename) + return self.template + + def match_tag_start(self): + match = self.match(r''' + \<% # opening tag + + ([\w\.\:]+) # keyword + + ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \ + # sign, string expression + + \s* # more whitespace + + (/)?> # closing + + ''', + + re.I | re.S | re.X) + + if match: + keyword, attr, isend = match.groups() + self.keyword = keyword + attributes = {} + if attr: + for att in re.findall( + r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr): + key, val1, val2 = att + text = val1 or val2 + text = text.replace('\r\n', '\n') + attributes[key] = text + self.append_node(parsetree.Tag, keyword, attributes) + if isend: + self.tag.pop() + else: + if keyword == 'text': + match = self.match(r'(.*?)(?=\</%text>)', re.S) + if not match: + raise exceptions.SyntaxException( + "Unclosed tag: <%%%s>" % + self.tag[-1].keyword, + **self.exception_kwargs) + self.append_node(parsetree.Text, match.group(1)) + return self.match_tag_end() + return True + else: + return False + + def match_tag_end(self): + match = self.match(r'\</%[\t ]*(.+?)[\t ]*>') + if match: + if not len(self.tag): + raise exceptions.SyntaxException( + "Closing tag without opening tag: </%%%s>" % + match.group(1), + **self.exception_kwargs) + elif self.tag[-1].keyword != match.group(1): + raise exceptions.SyntaxException( + "Closing tag </%%%s> does not match tag: <%%%s>" % + (match.group(1), self.tag[-1].keyword), + **self.exception_kwargs) + self.tag.pop() + return True + 
else: + return False + + def match_end(self): + match = self.match(r'\Z', re.S) + if match: + string = match.group() + if string: + return string + else: + return True + else: + return False + + def match_text(self): + match = self.match(r""" + (.*?) # anything, followed by: + ( + (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based + # comment preceded by a + # consumed newline and whitespace + | + (?=\${) # an expression + | + (?=</?[%&]) # a substitution or block or call start or end + # - don't consume + | + (\\\r?\n) # an escaped newline - throw away + | + \Z # end of string + )""", re.X | re.S) + + if match: + text = match.group(1) + if text: + self.append_node(parsetree.Text, text) + return True + else: + return False + + def match_python_block(self): + match = self.match(r"<%(!)?") + if match: + line, pos = self.matched_lineno, self.matched_charpos + text, end = self.parse_until_text(r'%>') + # the trailing newline helps + # compiler.parse() not complain about indentation + text = adjust_whitespace(text) + "\n" + self.append_node( + parsetree.Code, + text, + match.group(1) == '!', lineno=line, pos=pos) + return True + else: + return False + + def match_expression(self): + match = self.match(r"\${") + if match: + line, pos = self.matched_lineno, self.matched_charpos + text, end = self.parse_until_text(r'\|', r'}') + if end == '|': + escapes, end = self.parse_until_text(r'}') + else: + escapes = "" + text = text.replace('\r\n', '\n') + self.append_node( + parsetree.Expression, + text, escapes.strip(), + lineno=line, pos=pos) + return True + else: + return False + + def match_control_line(self): + match = self.match( + r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)" + r"(?:\r?\n|\Z)", re.M) + if match: + operator = match.group(1) + text = match.group(2) + if operator == '%': + m2 = re.match(r'(end)?(\w+)\s*(.*)', text) + if not m2: + raise exceptions.SyntaxException( + "Invalid control line: '%s'" % + text, + **self.exception_kwargs) + isend, keyword = m2.group(1, 2) + isend = (isend is not None) + + if isend: + if not len(self.control_line): + raise exceptions.SyntaxException( + "No starting keyword '%s' for '%s'" % + (keyword, text), + **self.exception_kwargs) + elif self.control_line[-1].keyword != keyword: + raise exceptions.SyntaxException( + "Keyword '%s' doesn't match keyword '%s'" % + (text, self.control_line[-1].keyword), + **self.exception_kwargs) + self.append_node(parsetree.ControlLine, keyword, isend, text) + else: + self.append_node(parsetree.Comment, text) + return True + else: + return False + + def match_comment(self): + """matches the multiline version of a comment""" + match = self.match(r"<%doc>(.*?)</%doc>", re.S) + if match: + self.append_node(parsetree.Comment, match.group(1)) + return True + else: + return False + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py new file mode 100644 index 00000000000..2af5411907a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py @@ -0,0 +1,359 @@ +# mako/lookup.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +import os, stat, posixpath, re +from mako import exceptions, util +from mako.template import Template + +try: + import threading +except: + import dummy_threading as threading + +class TemplateCollection(object): + """Represent a collection of 
:class:`.Template` objects, + identifiable via URI. + + A :class:`.TemplateCollection` is linked to the usage of + all template tags that address other templates, such + as ``<%include>``, ``<%namespace>``, and ``<%inherit>``. + The ``file`` attribute of each of those tags refers + to a string URI that is passed to that :class:`.Template` + object's :class:`.TemplateCollection` for resolution. + + :class:`.TemplateCollection` is an abstract class, + with the usual default implementation being :class:`.TemplateLookup`. + + """ + + def has_template(self, uri): + """Return ``True`` if this :class:`.TemplateLookup` is + capable of returning a :class:`.Template` object for the + given ``uri``. + + :param uri: String URI of the template to be resolved. + + """ + try: + self.get_template(uri) + return True + except exceptions.TemplateLookupException: + return False + + def get_template(self, uri, relativeto=None): + """Return a :class:`.Template` object corresponding to the given + ``uri``. + + The default implementation raises + :class:`.NotImplementedError`. Implementations should + raise :class:`.TemplateLookupException` if the given ``uri`` + cannot be resolved. + + :param uri: String URI of the template to be resolved. + :param relativeto: if present, the given ``uri`` is assumed to + be relative to this URI. + + """ + raise NotImplementedError() + + def filename_to_uri(self, uri, filename): + """Convert the given ``filename`` to a URI relative to + this :class:`.TemplateCollection`.""" + + return uri + + def adjust_uri(self, uri, filename): + """Adjust the given ``uri`` based on the calling ``filename``. + + When this method is called from the runtime, the + ``filename`` parameter is taken directly to the ``filename`` + attribute of the calling template. Therefore a custom + :class:`.TemplateCollection` subclass can place any string + identifier desired in the ``filename`` parameter of the + :class:`.Template` objects it constructs and have them come back + here. + + """ + return uri + +class TemplateLookup(TemplateCollection): + """Represent a collection of templates that locates template source files + from the local filesystem. + + The primary argument is the ``directories`` argument, the list of + directories to search: + + .. sourcecode:: python + + lookup = TemplateLookup(["/path/to/templates"]) + some_template = lookup.get_template("/index.html") + + The :class:`.TemplateLookup` can also be given :class:`.Template` objects + programatically using :meth:`.put_string` or :meth:`.put_template`: + + .. sourcecode:: python + + lookup = TemplateLookup() + lookup.put_string("base.html", ''' + <html><body>${self.next()}</body></html> + ''') + lookup.put_string("hello.html", ''' + <%include file='base.html'/> + + Hello, world ! + ''') + + + :param directories: A list of directory names which will be + searched for a particular template URI. The URI is appended + to each directory and the filesystem checked. + + :param collection_size: Approximate size of the collection used + to store templates. If left at its default of ``-1``, the size + is unbounded, and a plain Python dictionary is used to + relate URI strings to :class:`.Template` instances. + Otherwise, a least-recently-used cache object is used which + will maintain the size of the collection approximately to + the number given. 
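+
+     For example (an editorial sketch, not part of the upstream
+     docstring), a lookup capped at roughly 500 compiled templates:
+
+     .. sourcecode:: python
+
+         lookup = TemplateLookup(
+             ["/path/to/templates"], collection_size=500)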
+ + :param filesystem_checks: When at its default value of ``True``, + each call to :meth:`.TemplateLookup.get_template()` will + compare the filesystem last modified time to the time in + which an existing :class:`.Template` object was created. + This allows the :class:`.TemplateLookup` to regenerate a + new :class:`.Template` whenever the original source has + been updated. Set this to ``False`` for a very minor + performance increase. + + :param modulename_callable: A callable which, when present, + is passed the path of the source file as well as the + requested URI, and then returns the full path of the + generated Python module file. This is used to inject + alternate schemes for Python module location. If left at + its default of ``None``, the built in system of generation + based on ``module_directory`` plus ``uri`` is used. + + All other keyword parameters available for + :class:`.Template` are mirrored here. When new + :class:`.Template` objects are created, the keywords + established with this :class:`.TemplateLookup` are passed on + to each new :class:`.Template`. + + """ + + def __init__(self, + directories=None, + module_directory=None, + filesystem_checks=True, + collection_size=-1, + format_exceptions=False, + error_handler=None, + disable_unicode=False, + bytestring_passthrough=False, + output_encoding=None, + encoding_errors='strict', + + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + + modulename_callable=None, + module_writer=None, + default_filters=None, + buffer_filters=(), + strict_undefined=False, + imports=None, + future_imports=None, + enable_loop=True, + input_encoding=None, + preprocessor=None, + lexer_cls=None): + + self.directories = [posixpath.normpath(d) for d in + util.to_list(directories, ()) + ] + self.module_directory = module_directory + self.modulename_callable = modulename_callable + self.filesystem_checks = filesystem_checks + self.collection_size = collection_size + + if cache_args is None: + cache_args = {} + # transfer deprecated cache_* args + if cache_dir: + cache_args.setdefault('dir', cache_dir) + if cache_url: + cache_args.setdefault('url', cache_url) + if cache_type: + cache_args.setdefault('type', cache_type) + + self.template_args = { + 'format_exceptions':format_exceptions, + 'error_handler':error_handler, + 'disable_unicode':disable_unicode, + 'bytestring_passthrough':bytestring_passthrough, + 'output_encoding':output_encoding, + 'cache_impl':cache_impl, + 'encoding_errors':encoding_errors, + 'input_encoding':input_encoding, + 'module_directory':module_directory, + 'module_writer':module_writer, + 'cache_args':cache_args, + 'cache_enabled':cache_enabled, + 'default_filters':default_filters, + 'buffer_filters':buffer_filters, + 'strict_undefined':strict_undefined, + 'imports':imports, + 'future_imports':future_imports, + 'enable_loop':enable_loop, + 'preprocessor':preprocessor, + 'lexer_cls':lexer_cls + } + + if collection_size == -1: + self._collection = {} + self._uri_cache = {} + else: + self._collection = util.LRUCache(collection_size) + self._uri_cache = util.LRUCache(collection_size) + self._mutex = threading.Lock() + + def get_template(self, uri): + """Return a :class:`.Template` object corresponding to the given + ``uri``. + + .. note:: The ``relativeto`` argument is not supported here at the moment. 
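+
+        Usage sketch (editorial, not from the upstream docstring)::
+
+            lookup = TemplateLookup(["/path/to/templates"],
+                                    filesystem_checks=True)
+            template = lookup.get_template("/index.html")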
+ + """ + + try: + if self.filesystem_checks: + return self._check(uri, self._collection[uri]) + else: + return self._collection[uri] + except KeyError: + u = re.sub(r'^\/+', '', uri) + for dir in self.directories: + srcfile = posixpath.normpath(posixpath.join(dir, u)) + if os.path.isfile(srcfile): + return self._load(srcfile, uri) + else: + raise exceptions.TopLevelLookupException( + "Cant locate template for uri %r" % uri) + + def adjust_uri(self, uri, relativeto): + """Adjust the given ``uri`` based on the given relative URI.""" + + key = (uri, relativeto) + if key in self._uri_cache: + return self._uri_cache[key] + + if uri[0] != '/': + if relativeto is not None: + v = self._uri_cache[key] = posixpath.join( + posixpath.dirname(relativeto), uri) + else: + v = self._uri_cache[key] = '/' + uri + else: + v = self._uri_cache[key] = uri + return v + + + def filename_to_uri(self, filename): + """Convert the given ``filename`` to a URI relative to + this :class:`.TemplateCollection`.""" + + try: + return self._uri_cache[filename] + except KeyError: + value = self._relativeize(filename) + self._uri_cache[filename] = value + return value + + def _relativeize(self, filename): + """Return the portion of a filename that is 'relative' + to the directories in this lookup. + + """ + + filename = posixpath.normpath(filename) + for dir in self.directories: + if filename[0:len(dir)] == dir: + return filename[len(dir):] + else: + return None + + def _load(self, filename, uri): + self._mutex.acquire() + try: + try: + # try returning from collection one + # more time in case concurrent thread already loaded + return self._collection[uri] + except KeyError: + pass + try: + if self.modulename_callable is not None: + module_filename = self.modulename_callable(filename, uri) + else: + module_filename = None + self._collection[uri] = template = Template( + uri=uri, + filename=posixpath.normpath(filename), + lookup=self, + module_filename=module_filename, + **self.template_args) + return template + except: + # if compilation fails etc, ensure + # template is removed from collection, + # re-raise + self._collection.pop(uri, None) + raise + finally: + self._mutex.release() + + def _check(self, uri, template): + if template.filename is None: + return template + + try: + template_stat = os.stat(template.filename) + if template.module._modified_time < \ + template_stat[stat.ST_MTIME]: + self._collection.pop(uri, None) + return self._load(template.filename, uri) + else: + return template + except OSError: + self._collection.pop(uri, None) + raise exceptions.TemplateLookupException( + "Cant locate template for uri %r" % uri) + + + def put_string(self, uri, text): + """Place a new :class:`.Template` object into this + :class:`.TemplateLookup`, based on the given string of + ``text``. + + """ + self._collection[uri] = Template( + text, + lookup=self, + uri=uri, + **self.template_args) + + def put_template(self, uri, template): + """Place a new :class:`.Template` object into this + :class:`.TemplateLookup`, based on the given + :class:`.Template` object. 
+ + """ + self._collection[uri] = template + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py new file mode 100644 index 00000000000..49ec4e0696c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py @@ -0,0 +1,594 @@ +# mako/parsetree.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""defines the parse tree components for Mako templates.""" + +from mako import exceptions, ast, util, filters, compat +import re + +class Node(object): + """base class for a Node in the parse tree.""" + + def __init__(self, source, lineno, pos, filename): + self.source = source + self.lineno = lineno + self.pos = pos + self.filename = filename + + @property + def exception_kwargs(self): + return {'source': self.source, 'lineno': self.lineno, + 'pos': self.pos, 'filename': self.filename} + + def get_children(self): + return [] + + def accept_visitor(self, visitor): + def traverse(node): + for n in node.get_children(): + n.accept_visitor(visitor) + + method = getattr(visitor, "visit" + self.__class__.__name__, traverse) + method(self) + +class TemplateNode(Node): + """a 'container' node that stores the overall collection of nodes.""" + + def __init__(self, filename): + super(TemplateNode, self).__init__('', 0, 0, filename) + self.nodes = [] + self.page_attributes = {} + + def get_children(self): + return self.nodes + + def __repr__(self): + return "TemplateNode(%s, %r)" % ( + util.sorted_dict_repr(self.page_attributes), + self.nodes) + +class ControlLine(Node): + """defines a control line, a line-oriented python line or end tag. + + e.g.:: + + % if foo: + (markup) + % endif + + """ + + has_loop_context = False + + def __init__(self, keyword, isend, text, **kwargs): + super(ControlLine, self).__init__(**kwargs) + self.text = text + self.keyword = keyword + self.isend = isend + self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with'] + self.nodes = [] + if self.isend: + self._declared_identifiers = [] + self._undeclared_identifiers = [] + else: + code = ast.PythonFragment(text, **self.exception_kwargs) + self._declared_identifiers = code.declared_identifiers + self._undeclared_identifiers = code.undeclared_identifiers + + def get_children(self): + return self.nodes + + def declared_identifiers(self): + return self._declared_identifiers + + def undeclared_identifiers(self): + return self._undeclared_identifiers + + def is_ternary(self, keyword): + """return true if the given keyword is a ternary keyword + for this ControlLine""" + + return keyword in { + 'if':set(['else', 'elif']), + 'try':set(['except', 'finally']), + 'for':set(['else']) + }.get(self.keyword, []) + + def __repr__(self): + return "ControlLine(%r, %r, %r, %r)" % ( + self.keyword, + self.text, + self.isend, + (self.lineno, self.pos) + ) + +class Text(Node): + """defines plain text in the template.""" + + def __init__(self, content, **kwargs): + super(Text, self).__init__(**kwargs) + self.content = content + + def __repr__(self): + return "Text(%r, %r)" % (self.content, (self.lineno, self.pos)) + +class Code(Node): + """defines a Python code block, either inline or module level. + + e.g.:: + + inline: + <% + x = 12 + %> + + module level: + <%! 
+ import logger + %> + + """ + + def __init__(self, text, ismodule, **kwargs): + super(Code, self).__init__(**kwargs) + self.text = text + self.ismodule = ismodule + self.code = ast.PythonCode(text, **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers + + def __repr__(self): + return "Code(%r, %r, %r)" % ( + self.text, + self.ismodule, + (self.lineno, self.pos) + ) + +class Comment(Node): + """defines a comment line. + + # this is a comment + + """ + + def __init__(self, text, **kwargs): + super(Comment, self).__init__(**kwargs) + self.text = text + + def __repr__(self): + return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos)) + +class Expression(Node): + """defines an inline expression. + + ${x+y} + + """ + + def __init__(self, text, escapes, **kwargs): + super(Expression, self).__init__(**kwargs) + self.text = text + self.escapes = escapes + self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs) + self.code = ast.PythonCode(text, **self.exception_kwargs) + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + # TODO: make the "filter" shortcut list configurable at parse/gen time + return self.code.undeclared_identifiers.union( + self.escapes_code.undeclared_identifiers.difference( + set(filters.DEFAULT_ESCAPES.keys()) + ) + ).difference(self.code.declared_identifiers) + + def __repr__(self): + return "Expression(%r, %r, %r)" % ( + self.text, + self.escapes_code.args, + (self.lineno, self.pos) + ) + +class _TagMeta(type): + """metaclass to allow Tag to produce a subclass according to + its keyword""" + + _classmap = {} + + def __init__(cls, clsname, bases, dict): + if getattr(cls, '__keyword__', None) is not None: + cls._classmap[cls.__keyword__] = cls + super(_TagMeta, cls).__init__(clsname, bases, dict) + + def __call__(cls, keyword, attributes, **kwargs): + if ":" in keyword: + ns, defname = keyword.split(':') + return type.__call__(CallNamespaceTag, ns, defname, + attributes, **kwargs) + + try: + cls = _TagMeta._classmap[keyword] + except KeyError: + raise exceptions.CompileException( + "No such tag: '%s'" % keyword, + source=kwargs['source'], + lineno=kwargs['lineno'], + pos=kwargs['pos'], + filename=kwargs['filename'] + ) + return type.__call__(cls, keyword, attributes, **kwargs) + +class Tag(compat.with_metaclass(_TagMeta, Node)): + """abstract base class for tags. + + <%sometag/> + + <%someothertag> + stuff + </%someothertag> + + """ + __keyword__ = None + + def __init__(self, keyword, attributes, expressions, + nonexpressions, required, **kwargs): + """construct a new Tag instance. + + this constructor not called directly, and is only called + by subclasses. 
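+
+        For example (editorial note, drawn from the subclasses below),
+        ``IncludeTag`` invokes it as::
+
+            super(IncludeTag, self).__init__(
+                keyword, attributes,
+                ('file', 'import', 'args'),
+                (), ('file',), **kwargs)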
+ + :param keyword: the tag keyword + + :param attributes: raw dictionary of attribute key/value pairs + + :param expressions: a set of identifiers that are legal attributes, + which can also contain embedded expressions + + :param nonexpressions: a set of identifiers that are legal + attributes, which cannot contain embedded expressions + + :param \**kwargs: + other arguments passed to the Node superclass (lineno, pos) + + """ + super(Tag, self).__init__(**kwargs) + self.keyword = keyword + self.attributes = attributes + self._parse_attributes(expressions, nonexpressions) + missing = [r for r in required if r not in self.parsed_attributes] + if len(missing): + raise exceptions.CompileException( + "Missing attribute(s): %s" % + ",".join([repr(m) for m in missing]), + **self.exception_kwargs) + self.parent = None + self.nodes = [] + + def is_root(self): + return self.parent is None + + def get_children(self): + return self.nodes + + def _parse_attributes(self, expressions, nonexpressions): + undeclared_identifiers = set() + self.parsed_attributes = {} + for key in self.attributes: + if key in expressions: + expr = [] + for x in re.compile(r'(\${.+?})', + re.S).split(self.attributes[key]): + m = re.compile(r'^\${(.+?)}$', re.S).match(x) + if m: + code = ast.PythonCode(m.group(1).rstrip(), + **self.exception_kwargs) + # we aren't discarding "declared_identifiers" here, + # which we do so that list comprehension-declared + # variables aren't counted. As yet can't find a + # condition that requires it here. + undeclared_identifiers = \ + undeclared_identifiers.union( + code.undeclared_identifiers) + expr.append('(%s)' % m.group(1)) + else: + if x: + expr.append(repr(x)) + self.parsed_attributes[key] = " + ".join(expr) or repr('') + elif key in nonexpressions: + if re.search(r'\${.+?}', self.attributes[key]): + raise exceptions.CompileException( + "Attibute '%s' in tag '%s' does not allow embedded " + "expressions" % (key, self.keyword), + **self.exception_kwargs) + self.parsed_attributes[key] = repr(self.attributes[key]) + else: + raise exceptions.CompileException( + "Invalid attribute for tag '%s': '%s'" % + (self.keyword, key), + **self.exception_kwargs) + self.expression_undeclared_identifiers = undeclared_identifiers + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + return self.expression_undeclared_identifiers + + def __repr__(self): + return "%s(%r, %s, %r, %r)" % (self.__class__.__name__, + self.keyword, + util.sorted_dict_repr(self.attributes), + (self.lineno, self.pos), + self.nodes + ) + +class IncludeTag(Tag): + __keyword__ = 'include' + + def __init__(self, keyword, attributes, **kwargs): + super(IncludeTag, self).__init__( + keyword, + attributes, + ('file', 'import', 'args'), + (), ('file',), **kwargs) + self.page_args = ast.PythonCode( + "__DUMMY(%s)" % attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + identifiers = self.page_args.undeclared_identifiers.\ + difference(set(["__DUMMY"])).\ + difference(self.page_args.declared_identifiers) + return identifiers.union(super(IncludeTag, self). 
+ undeclared_identifiers()) + +class NamespaceTag(Tag): + __keyword__ = 'namespace' + + def __init__(self, keyword, attributes, **kwargs): + super(NamespaceTag, self).__init__( + keyword, attributes, + ('file',), + ('name','inheritable', + 'import','module'), + (), **kwargs) + + self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self)))) + if not 'name' in attributes and not 'import' in attributes: + raise exceptions.CompileException( + "'name' and/or 'import' attributes are required " + "for <%namespace>", + **self.exception_kwargs) + if 'file' in attributes and 'module' in attributes: + raise exceptions.CompileException( + "<%namespace> may only have one of 'file' or 'module'", + **self.exception_kwargs + ) + + def declared_identifiers(self): + return [] + +class TextTag(Tag): + __keyword__ = 'text' + + def __init__(self, keyword, attributes, **kwargs): + super(TextTag, self).__init__( + keyword, + attributes, (), + ('filter'), (), **kwargs) + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + def undeclared_identifiers(self): + return self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()).union( + self.expression_undeclared_identifiers + ) + +class DefTag(Tag): + __keyword__ = 'def' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['buffered', 'cached'] + [ + c for c in attributes if c.startswith('cache_')] + + + super(DefTag, self).__init__( + keyword, + attributes, + expressions, + ('name', 'filter', 'decorator'), + ('name',), + **kwargs) + name = attributes['name'] + if re.match(r'^[\w_]+$', name): + raise exceptions.CompileException( + "Missing parenthesis in %def", + **self.exception_kwargs) + self.function_decl = ast.FunctionDecl("def " + name + ":pass", + **self.exception_kwargs) + self.name = self.function_decl.funcname + self.decorator = attributes.get('decorator', '') + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + is_anonymous = False + is_block = False + + @property + def funcname(self): + return self.function_decl.funcname + + def get_argument_expressions(self, **kw): + return self.function_decl.get_argument_expressions(**kw) + + def declared_identifiers(self): + return self.function_decl.allargnames + + def undeclared_identifiers(self): + res = [] + for c in self.function_decl.defaults: + res += list(ast.PythonCode(c, **self.exception_kwargs). 
+ undeclared_identifiers) + return set(res).union( + self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()) + ).union( + self.expression_undeclared_identifiers + ).difference( + self.function_decl.allargnames + ) + +class BlockTag(Tag): + __keyword__ = 'block' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['buffered', 'cached', 'args'] + [ + c for c in attributes if c.startswith('cache_')] + + super(BlockTag, self).__init__( + keyword, + attributes, + expressions, + ('name','filter', 'decorator'), + (), + **kwargs) + name = attributes.get('name') + if name and not re.match(r'^[\w_]+$',name): + raise exceptions.CompileException( + "%block may not specify an argument signature", + **self.exception_kwargs) + if not name and attributes.get('args', None): + raise exceptions.CompileException( + "Only named %blocks may specify args", + **self.exception_kwargs + ) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + + self.name = name + self.decorator = attributes.get('decorator', '') + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + + is_block = True + + @property + def is_anonymous(self): + return self.name is None + + @property + def funcname(self): + return self.name or "__M_anon_%d" % (self.lineno, ) + + def get_argument_expressions(self, **kw): + return self.body_decl.get_argument_expressions(**kw) + + def declared_identifiers(self): + return self.body_decl.allargnames + + def undeclared_identifiers(self): + return (self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()) + ).union(self.expression_undeclared_identifiers) + + + +class CallTag(Tag): + __keyword__ = 'call' + + def __init__(self, keyword, attributes, **kwargs): + super(CallTag, self).__init__(keyword, attributes, + ('args'), ('expr',), ('expr',), **kwargs) + self.expression = attributes['expr'] + self.code = ast.PythonCode(self.expression, **self.exception_kwargs) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers.union(self.body_decl.allargnames) + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers.\ + difference(self.code.declared_identifiers) + +class CallNamespaceTag(Tag): + + def __init__(self, namespace, defname, attributes, **kwargs): + super(CallNamespaceTag, self).__init__( + namespace + ":" + defname, + attributes, + tuple(attributes.keys()) + ('args', ), + (), + (), + **kwargs) + + self.expression = "%s.%s(%s)" % ( + namespace, + defname, + ",".join(["%s=%s" % (k, v) for k, v in + self.parsed_attributes.items() + if k != 'args']) + ) + self.code = ast.PythonCode(self.expression, **self.exception_kwargs) + self.body_decl = ast.FunctionArgs( + attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers.union(self.body_decl.allargnames) + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers.\ + difference(self.code.declared_identifiers) + +class InheritTag(Tag): + __keyword__ = 'inherit' + + def __init__(self, keyword, attributes, **kwargs): + super(InheritTag, self).__init__( + keyword, attributes, + ('file',), (), ('file',), **kwargs) + +class PageTag(Tag): + __keyword__ = 'page' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['cached', 'args', 'expression_filter', 
'enable_loop'] + [ + c for c in attributes if c.startswith('cache_')] + + super(PageTag, self).__init__( + keyword, + attributes, + expressions, + (), + (), + **kwargs) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + self.filter_args = ast.ArgumentList( + attributes.get('expression_filter', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.body_decl.allargnames + + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py new file mode 100644 index 00000000000..5ba5125a4c7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py @@ -0,0 +1,299 @@ +# mako/pygen.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""utilities for generating and formatting literal Python code.""" + +import re +from mako import exceptions + +class PythonPrinter(object): + def __init__(self, stream): + # indentation counter + self.indent = 0 + + # a stack storing information about why we incremented + # the indentation counter, to help us determine if we + # should decrement it + self.indent_detail = [] + + # the string of whitespace multiplied by the indent + # counter to produce a line + self.indentstring = " " + + # the stream we are writing to + self.stream = stream + + # current line number + self.lineno = 1 + + # a list of lines that represents a buffered "block" of code, + # which can be later printed relative to an indent level + self.line_buffer = [] + + self.in_indent_lines = False + + self._reset_multi_line_flags() + + # mapping of generated python lines to template + # source lines + self.source_map = {} + + def _update_lineno(self, num): + self.lineno += num + + def start_source(self, lineno): + if self.lineno not in self.source_map: + self.source_map[self.lineno] = lineno + + def write_blanks(self, num): + self.stream.write("\n" * num) + self._update_lineno(num) + + def write_indented_block(self, block): + """print a line or lines of python which already contain indentation. + + The indentation of the total block of lines will be adjusted to that of + the current indent level.""" + self.in_indent_lines = False + for l in re.split(r'\r?\n', block): + self.line_buffer.append(l) + self._update_lineno(1) + + def writelines(self, *lines): + """print a series of lines of python.""" + for line in lines: + self.writeline(line) + + def writeline(self, line): + """print a line of python, indenting it according to the current + indent level. + + this also adjusts the indentation counter according to the + content of the line. + + """ + + if not self.in_indent_lines: + self._flush_adjusted_lines() + self.in_indent_lines = True + + if (line is None or + re.match(r"^\s*#",line) or + re.match(r"^\s*$", line) + ): + hastext = False + else: + hastext = True + + is_comment = line and len(line) and line[0] == '#' + + # see if this line should decrease the indentation level + if (not is_comment and + (not hastext or self._is_unindentor(line)) + ): + + if self.indent > 0: + self.indent -= 1 + # if the indent_detail stack is empty, the user + # probably put extra closures - the resulting + # module wont compile. 
+ if len(self.indent_detail) == 0:
+ raise exceptions.SyntaxException(
+ "Too many whitespace closures")
+ self.indent_detail.pop()
+
+ if line is None:
+ return
+
+ # write the line
+ self.stream.write(self._indent_line(line) + "\n")
+ self._update_lineno(len(line.split("\n")))
+
+ # see if this line should increase the indentation level.
+ # note that a line can both decrease (before printing) and
+ # then increase (after printing) the indentation level.
+
+ if re.search(r":[ \t]*(?:#.*)?$", line):
+ # increment indentation count, and also
+ # keep track of what the keyword was that indented us,
+ # if it is a python compound statement keyword
+ # where we might have to look for an "unindent" keyword
+ match = re.match(r"^\s*(if|try|elif|while|for|with)", line)
+ if match:
+ # its a "compound" keyword, so we will check for "unindentors"
+ indentor = match.group(1)
+ self.indent += 1
+ self.indent_detail.append(indentor)
+ else:
+ indentor = None
+ # its not a "compound" keyword. but lets also
+ # test for valid Python keywords that might be indenting us,
+ # else assume its a non-indenting line
+ m2 = re.match(r"^\s*(def|class|else|elif|except|finally)",
+ line)
+ if m2:
+ self.indent += 1
+ self.indent_detail.append(indentor)
+
+ def close(self):
+ """close this printer, flushing any remaining lines."""
+ self._flush_adjusted_lines()
+
+ def _is_unindentor(self, line):
+ """return true if the given line is an 'unindentor',
+ relative to the last 'indent' event received.
+
+ """
+
+ # no indentation detail has been pushed on; return False
+ if len(self.indent_detail) == 0:
+ return False
+
+ indentor = self.indent_detail[-1]
+
+ # the last indent keyword we grabbed is not a
+ # compound statement keyword; return False
+ if indentor is None:
+ return False
+
+ # if the current line doesnt have one of the "unindentor" keywords,
+ # return False
+ match = re.match(r"^\s*(else|elif|except|finally).*\:", line)
+ if not match:
+ return False
+
+ # whitespace matches up, we have a compound indentor,
+ # and this line has an unindentor, this
+ # is probably good enough
+ return True
+
+ # should we decide that its not good enough, heres
+ # more stuff to check.
+ #keyword = match.group(1)
+
+ # match the original indent keyword
+ #for crit in [
+ # (r'if|elif', r'else|elif'),
+ # (r'try', r'except|finally|else'),
+ # (r'while|for', r'else'),
+ #]:
+ # if re.match(crit[0], indentor) and re.match(crit[1], keyword):
+ # return True
+
+ #return False
+
+ def _indent_line(self, line, stripspace=''):
+ """indent the given line according to the current indent level.
+
+ stripspace is a string of space that will be truncated from the
+ start of the line before indenting."""
+
+ return re.sub(r"^%s" % stripspace, self.indentstring
+ * self.indent, line)
+
+ def _reset_multi_line_flags(self):
+ """reset the flags which would indicate we are in a backslashed
+ or triple-quoted section."""
+
+ self.backslashed, self.triplequoted = False, False
+
+ def _in_multi_line(self, line):
+ """return true if the given line is part of a multi-line block,
+ via backslash or triple-quote."""
+
+ # we are only looking for explicitly joined lines here, not
+ # implicit ones (i.e. brackets, braces etc.).
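How the PythonPrinter above is typically driven, as a hedged usage sketch (Python 3; writeline() receives flat strings and the printer infers indentation from trailing colons and unindent keywords such as "else"):

    from io import StringIO

    buf = StringIO()
    printer = PythonPrinter(buf)            # the class defined in this file
    for stmt in ["if x:", "y = 1", "else:", "y = 2"]:
        printer.writeline(stmt)
    printer.close()
    print(buf.getvalue())
    # if x:
    #     y = 1
    # else:
    #     y = 2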
this is just to + # guard against the possibility of modifying the space inside of + # a literal multiline string with unfortunately placed + # whitespace + + current_state = (self.backslashed or self.triplequoted) + + if re.search(r"\\$", line): + self.backslashed = True + else: + self.backslashed = False + + triples = len(re.findall(r"\"\"\"|\'\'\'", line)) + if triples == 1 or triples % 2 != 0: + self.triplequoted = not self.triplequoted + + return current_state + + def _flush_adjusted_lines(self): + stripspace = None + self._reset_multi_line_flags() + + for entry in self.line_buffer: + if self._in_multi_line(entry): + self.stream.write(entry + "\n") + else: + entry = entry.expandtabs() + if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry): + stripspace = re.match(r"^([ \t]*)", entry).group(1) + self.stream.write(self._indent_line(entry, stripspace) + "\n") + + self.line_buffer = [] + self._reset_multi_line_flags() + + +def adjust_whitespace(text): + """remove the left-whitespace margin of a block of Python code.""" + + state = [False, False] + (backslashed, triplequoted) = (0, 1) + + def in_multi_line(line): + start_state = (state[backslashed] or state[triplequoted]) + + if re.search(r"\\$", line): + state[backslashed] = True + else: + state[backslashed] = False + + def match(reg, t): + m = re.match(reg, t) + if m: + return m, t[len(m.group(0)):] + else: + return None, t + + while line: + if state[triplequoted]: + m, line = match(r"%s" % state[triplequoted], line) + if m: + state[triplequoted] = False + else: + m, line = match(r".*?(?=%s|$)" % state[triplequoted], line) + else: + m, line = match(r'#', line) + if m: + return start_state + + m, line = match(r"\"\"\"|\'\'\'", line) + if m: + state[triplequoted] = m.group(0) + continue + + m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line) + + return start_state + + def _indent_line(line, stripspace=''): + return re.sub(r"^%s" % stripspace, '', line) + + lines = [] + stripspace = None + + for line in re.split(r'\r?\n', text): + if in_multi_line(line): + lines.append(line) + else: + line = line.expandtabs() + if stripspace is None and re.search(r"^[ \t]*[^# \t]", line): + stripspace = re.match(r"^([ \t]*)", line).group(1) + lines.append(_indent_line(line, stripspace)) + return "\n".join(lines) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py new file mode 100644 index 00000000000..bfa46a9fafd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py @@ -0,0 +1,232 @@ +# mako/pyparser.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""Handles parsing of Python code. + +Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler +module is used. 
+""" + +from mako import exceptions, util, compat +from mako.compat import arg_stringname +import operator + +if compat.py3k: + # words that cannot be assigned to (notably + # smaller than the total keys in __builtins__) + reserved = set(['True', 'False', 'None', 'print']) + + # the "id" attribute on a function node + arg_id = operator.attrgetter('arg') +else: + # words that cannot be assigned to (notably + # smaller than the total keys in __builtins__) + reserved = set(['True', 'False', 'None']) + + # the "id" attribute on a function node + arg_id = operator.attrgetter('id') + +import _ast +util.restore__ast(_ast) +from mako import _ast_util + + +def parse(code, mode='exec', **exception_kwargs): + """Parse an expression into AST""" + + try: + return _ast_util.parse(code, '<unknown>', mode) + except Exception: + raise exceptions.SyntaxException( + "(%s) %s (%r)" % ( + compat.exception_as().__class__.__name__, + compat.exception_as(), + code[0:50] + ), **exception_kwargs) + + +class FindIdentifiers(_ast_util.NodeVisitor): + + def __init__(self, listener, **exception_kwargs): + self.in_function = False + self.in_assign_targets = False + self.local_ident_stack = set() + self.listener = listener + self.exception_kwargs = exception_kwargs + + def _add_declared(self, name): + if not self.in_function: + self.listener.declared_identifiers.add(name) + else: + self.local_ident_stack.add(name) + + def visit_ClassDef(self, node): + self._add_declared(node.name) + + def visit_Assign(self, node): + + # flip around the visiting of Assign so the expression gets + # evaluated first, in the case of a clause like "x=x+5" (x + # is undeclared) + + self.visit(node.value) + in_a = self.in_assign_targets + self.in_assign_targets = True + for n in node.targets: + self.visit(n) + self.in_assign_targets = in_a + + if compat.py3k: + + # ExceptHandler is in Python 2, but this block only works in + # Python 3 (and is required there) + + def visit_ExceptHandler(self, node): + if node.name is not None: + self._add_declared(node.name) + if node.type is not None: + self.visit(node.type) + for statement in node.body: + self.visit(statement) + + def visit_Lambda(self, node, *args): + self._visit_function(node, True) + + def visit_FunctionDef(self, node): + self._add_declared(node.name) + self._visit_function(node, False) + + def _expand_tuples(self, args): + for arg in args: + if isinstance(arg, _ast.Tuple): + for n in arg.elts: + yield n + else: + yield arg + + def _visit_function(self, node, islambda): + + # push function state onto stack. dont log any more + # identifiers as "declared" until outside of the function, + # but keep logging identifiers as "undeclared". 
track
+ # argument names in each function header so they arent
+ # counted as "undeclared"
+
+ inf = self.in_function
+ self.in_function = True
+
+ local_ident_stack = self.local_ident_stack
+ self.local_ident_stack = local_ident_stack.union([
+ arg_id(arg) for arg in self._expand_tuples(node.args.args)
+ ])
+ if islambda:
+ self.visit(node.body)
+ else:
+ for n in node.body:
+ self.visit(n)
+ self.in_function = inf
+ self.local_ident_stack = local_ident_stack
+
+ def visit_For(self, node):
+
+ # flip around visit
+
+ self.visit(node.iter)
+ self.visit(node.target)
+ for statement in node.body:
+ self.visit(statement)
+ for statement in node.orelse:
+ self.visit(statement)
+
+ def visit_Name(self, node):
+ if isinstance(node.ctx, _ast.Store):
+ # this is equivalent to visit_AssName in
+ # compiler
+ self._add_declared(node.id)
+ elif node.id not in reserved and node.id \
+ not in self.listener.declared_identifiers and node.id \
+ not in self.local_ident_stack:
+ self.listener.undeclared_identifiers.add(node.id)
+
+ def visit_Import(self, node):
+ for name in node.names:
+ if name.asname is not None:
+ self._add_declared(name.asname)
+ else:
+ self._add_declared(name.name.split('.')[0])
+
+ def visit_ImportFrom(self, node):
+ for name in node.names:
+ if name.asname is not None:
+ self._add_declared(name.asname)
+ else:
+ if name.name == '*':
+ raise exceptions.CompileException(
+ "'import *' is not supported, since all identifier "
+ "names must be explicitly declared. Please use the "
+ "form 'from <modulename> import <name1>, <name2>, "
+ "...' instead.", **self.exception_kwargs)
+ self._add_declared(name.name)
+
+
+class FindTuple(_ast_util.NodeVisitor):
+
+ def __init__(self, listener, code_factory, **exception_kwargs):
+ self.listener = listener
+ self.exception_kwargs = exception_kwargs
+ self.code_factory = code_factory
+
+ def visit_Tuple(self, node):
+ for n in node.elts:
+ p = self.code_factory(n, **self.exception_kwargs)
+ self.listener.codeargs.append(p)
+ self.listener.args.append(ExpressionGenerator(n).value())
+ self.listener.declared_identifiers = \
+ self.listener.declared_identifiers.union(
+ p.declared_identifiers)
+ self.listener.undeclared_identifiers = \
+ self.listener.undeclared_identifiers.union(
+ p.undeclared_identifiers)
+
+
+class ParseFunc(_ast_util.NodeVisitor):
+
+ def __init__(self, listener, **exception_kwargs):
+ self.listener = listener
+ self.exception_kwargs = exception_kwargs
+
+ def visit_FunctionDef(self, node):
+ self.listener.funcname = node.name
+
+ argnames = [arg_id(arg) for arg in node.args.args]
+ if node.args.vararg:
+ argnames.append(arg_stringname(node.args.vararg))
+
+ if compat.py2k:
+ # kw-only args don't exist in Python 2
+ kwargnames = []
+ else:
+ kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs]
+ if node.args.kwarg:
+ kwargnames.append(arg_stringname(node.args.kwarg))
+ self.listener.argnames = argnames
+ self.listener.defaults = node.args.defaults # ast
+ self.listener.kwargnames = kwargnames
+ if compat.py2k:
+ self.listener.kwdefaults = []
+ else:
+ self.listener.kwdefaults = node.args.kw_defaults
+ self.listener.varargs = node.args.vararg
+ self.listener.kwargs = node.args.kwarg
+
+class ExpressionGenerator(object):
+
+ def __init__(self, astnode):
+ self.generator = _ast_util.SourceGenerator(' ' * 4)
+ self.generator.visit(astnode)
+
+ def value(self):
+ return ''.join(self.generator.result)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
new file mode 100644
index 00000000000..6b6a35a9215
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
@@ -0,0 +1,878 @@
+# mako/runtime.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""provides runtime services for templates, including Context,
+Namespace, and various helper functions."""
+
+from mako import exceptions, util, compat
+from mako.compat import compat_builtins
+import sys
+
+
+class Context(object):
+ """Provides runtime namespace, output buffer, and various
+ callstacks for templates.
+
+ See :ref:`runtime_toplevel` for detail on the usage of
+ :class:`.Context`.
+
+ """
+
+ def __init__(self, buffer, **data):
+ self._buffer_stack = [buffer]
+
+ self._data = data
+
+ self._kwargs = data.copy()
+ self._with_template = None
+ self._outputting_as_unicode = None
+ self.namespaces = {}
+
+ # "capture" function which proxies to the
+ # generic "capture" function
+ self._data['capture'] = compat.partial(capture, self)
+
+ # "caller" stack used by def calls with content
+ self.caller_stack = self._data['caller'] = CallerStack()
+
+ def _set_with_template(self, t):
+ self._with_template = t
+ illegal_names = t.reserved_names.intersection(self._data)
+ if illegal_names:
+ raise exceptions.NameConflictError(
+ "Reserved words passed to render(): %s" %
+ ", ".join(illegal_names))
+
+ @property
+ def lookup(self):
+ """Return the :class:`.TemplateLookup` associated
+ with this :class:`.Context`.
+
+ """
+ return self._with_template.lookup
+
+ @property
+ def kwargs(self):
+ """Return the dictionary of top level keyword arguments associated
+ with this :class:`.Context`.
+
+ This dictionary only includes the top-level arguments passed to
+ :meth:`.Template.render`. It does not include names produced within
+ the template execution such as local variable names or special names
+ such as ``self``, ``next``, etc.
+
+ The purpose of this dictionary is primarily for the case that
+ a :class:`.Template` accepts arguments via its ``<%page>`` tag,
+ which are normally expected to be passed via :meth:`.Template.render`,
+ except the template is being called in an inheritance context,
+ using the ``body()`` method. :attr:`.Context.kwargs` can then be
+ used to propagate these arguments to the inheriting template::
+
+ ${next.body(**context.kwargs)}
+
+ """
+ return self._kwargs.copy()
+
+ def push_caller(self, caller):
+ """Push a ``caller`` callable onto the callstack for
+ this :class:`.Context`."""
+
+
+ self.caller_stack.append(caller)
+
+ def pop_caller(self):
+ """Pop a ``caller`` callable off the callstack for this
+ :class:`.Context`."""
+
+ del self.caller_stack[-1]
+
+ def keys(self):
+ """Return a list of all names established in this :class:`.Context`."""
+
+ return list(self._data.keys())
+
+ def __getitem__(self, key):
+ if key in self._data:
+ return self._data[key]
+ else:
+ return compat_builtins.__dict__[key]
+
+ def _push_writer(self):
+ """push a capturing buffer onto this Context and return
+ the new writer function."""
+
+ buf = util.FastEncodingBuffer()
+ self._buffer_stack.append(buf)
+ return buf.write
+
+ def _pop_buffer_and_writer(self):
+ """pop the most recent capturing buffer from this Context
+ and return the current writer after the pop.
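The buffer stack in action, assuming this vendored package is importable as mako; rendering through an explicit Context like this is documented Mako usage:

    from io import StringIO
    from mako.runtime import Context
    from mako.template import Template

    buf = StringIO()
    t = Template("hello, ${name}!")
    t.render_context(Context(buf, name="mako"))   # writes into buf
    print(buf.getvalue())                         # hello, mako!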
+ + """ + + buf = self._buffer_stack.pop() + return buf, self._buffer_stack[-1].write + + def _push_buffer(self): + """push a capturing buffer onto this Context.""" + + self._push_writer() + + def _pop_buffer(self): + """pop the most recent capturing buffer from this Context.""" + + return self._buffer_stack.pop() + + def get(self, key, default=None): + """Return a value from this :class:`.Context`.""" + + return self._data.get(key, compat_builtins.__dict__.get(key, default)) + + def write(self, string): + """Write a string to this :class:`.Context` object's + underlying output buffer.""" + + self._buffer_stack[-1].write(string) + + def writer(self): + """Return the current writer function.""" + + return self._buffer_stack[-1].write + + def _copy(self): + c = Context.__new__(Context) + c._buffer_stack = self._buffer_stack + c._data = self._data.copy() + c._kwargs = self._kwargs + c._with_template = self._with_template + c._outputting_as_unicode = self._outputting_as_unicode + c.namespaces = self.namespaces + c.caller_stack = self.caller_stack + return c + + def _locals(self, d): + """Create a new :class:`.Context` with a copy of this + :class:`.Context`'s current state, + updated with the given dictionary. + + The :attr:`.Context.kwargs` collection remains + unaffected. + + + """ + + if not d: + return self + c = self._copy() + c._data.update(d) + return c + + def _clean_inheritance_tokens(self): + """create a new copy of this :class:`.Context`. with + tokens related to inheritance state removed.""" + + c = self._copy() + x = c._data + x.pop('self', None) + x.pop('parent', None) + x.pop('next', None) + return c + +class CallerStack(list): + def __init__(self): + self.nextcaller = None + + def __nonzero__(self): + return self.__bool__() + + def __bool__(self): + return len(self) and self._get_caller() and True or False + + def _get_caller(self): + # this method can be removed once + # codegen MAGIC_NUMBER moves past 7 + return self[-1] + + def __getattr__(self, key): + return getattr(self._get_caller(), key) + + def _push_frame(self): + frame = self.nextcaller or None + self.append(frame) + self.nextcaller = None + return frame + + def _pop_frame(self): + self.nextcaller = self.pop() + + +class Undefined(object): + """Represents an undefined value in a template. + + All template modules have a constant value + ``UNDEFINED`` present which is an instance of this + object. + + """ + def __str__(self): + raise NameError("Undefined") + + def __nonzero__(self): + return self.__bool__() + + def __bool__(self): + return False + +UNDEFINED = Undefined() + +class LoopStack(object): + """a stack for LoopContexts that implements the context manager protocol + to automatically pop off the top of the stack on context exit + """ + + def __init__(self): + self.stack = [] + + def _enter(self, iterable): + self._push(iterable) + return self._top + + def _exit(self): + self._pop() + return self._top + + @property + def _top(self): + if self.stack: + return self.stack[-1] + else: + return self + + def _pop(self): + return self.stack.pop() + + def _push(self, iterable): + new = LoopContext(iterable) + if self.stack: + new.parent = self.stack[-1] + return self.stack.append(new) + + def __getattr__(self, key): + raise exceptions.RuntimeException("No loop context is established") + + def __iter__(self): + return iter(self._top) + + +class LoopContext(object): + """A magic loop variable. + Automatically accessible in any ``% for`` block. + + See the section :ref:`loop_context` for usage + notes. 
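The ``loop`` variable described here, exercised end to end (again assuming the vendored mako package is importable):

    from mako.template import Template

    t = Template(
        "% for c in 'abc':\n"
        "${loop.index}: ${c}\n"
        "% endfor\n")
    print(t.render())
    # 0: a
    # 1: b
    # 2: c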
+ + :attr:`parent` -> :class:`.LoopContext` or ``None`` + The parent loop, if one exists. + :attr:`index` -> `int` + The 0-based iteration count. + :attr:`reverse_index` -> `int` + The number of iterations remaining. + :attr:`first` -> `bool` + ``True`` on the first iteration, ``False`` otherwise. + :attr:`last` -> `bool` + ``True`` on the last iteration, ``False`` otherwise. + :attr:`even` -> `bool` + ``True`` when ``index`` is even. + :attr:`odd` -> `bool` + ``True`` when ``index`` is odd. + """ + + def __init__(self, iterable): + self._iterable = iterable + self.index = 0 + self.parent = None + + def __iter__(self): + for i in self._iterable: + yield i + self.index += 1 + + @util.memoized_instancemethod + def __len__(self): + return len(self._iterable) + + @property + def reverse_index(self): + return len(self) - self.index - 1 + + @property + def first(self): + return self.index == 0 + + @property + def last(self): + return self.index == len(self) - 1 + + @property + def even(self): + return not self.odd + + @property + def odd(self): + return bool(self.index % 2) + + def cycle(self, *values): + """Cycle through values as the loop progresses. + """ + if not values: + raise ValueError("You must provide values to cycle through") + return values[self.index % len(values)] + + +class _NSAttr(object): + def __init__(self, parent): + self.__parent = parent + def __getattr__(self, key): + ns = self.__parent + while ns: + if hasattr(ns.module, key): + return getattr(ns.module, key) + else: + ns = ns.inherits + raise AttributeError(key) + +class Namespace(object): + """Provides access to collections of rendering methods, which + can be local, from other templates, or from imported modules. + + To access a particular rendering method referenced by a + :class:`.Namespace`, use plain attribute access: + + .. sourcecode:: mako + + ${some_namespace.foo(x, y, z)} + + :class:`.Namespace` also contains several built-in attributes + described here. + + """ + + def __init__(self, name, context, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + callables = () + + module = None + """The Python module referenced by this :class:`.Namespace`. + + If the namespace references a :class:`.Template`, then + this module is the equivalent of ``template.module``, + i.e. the generated module for the template. + + """ + + template = None + """The :class:`.Template` object referenced by this + :class:`.Namespace`, if any. + + """ + + context = None + """The :class:`.Context` object for this :class:`.Namespace`. + + Namespaces are often created with copies of contexts that + contain slightly different data, particularly in inheritance + scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one + can traverse an entire chain of templates that inherit from + one-another. + + """ + + filename = None + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + + If this is a pure module-based + :class:`.Namespace`, this evaluates to ``module.__file__``. If a + template-based namespace, it evaluates to the original + template file location. + + """ + + uri = None + """The URI for this :class:`.Namespace`'s template. + + I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. + + This is the equivalent of :attr:`.Template.uri`. 
+ + """ + + _templateuri = None + + @util.memoized_property + def attr(self): + """Access module level attributes by name. + + This accessor allows templates to supply "scalar" + attributes which are particularly handy in inheritance + relationships. + + .. seealso:: + + :ref:`inheritance_attr` + + :ref:`namespace_attr_for_includes` + + """ + return _NSAttr(self) + + def get_namespace(self, uri): + """Return a :class:`.Namespace` corresponding to the given ``uri``. + + If the given ``uri`` is a relative URI (i.e. it does not + contain a leading slash ``/``), the ``uri`` is adjusted to + be relative to the ``uri`` of the namespace itself. This + method is therefore mostly useful off of the built-in + ``local`` namespace, described in :ref:`namespace_local`. + + In + most cases, a template wouldn't need this function, and + should instead use the ``<%namespace>`` tag to load + namespaces. However, since all ``<%namespace>`` tags are + evaluated before the body of a template ever runs, + this method can be used to locate namespaces using + expressions that were generated within the body code of + the template, or to conditionally use a particular + namespace. + + """ + key = (self, uri) + if key in self.context.namespaces: + return self.context.namespaces[key] + else: + ns = TemplateNamespace(uri, self.context._copy(), + templateuri=uri, + calling_uri=self._templateuri) + self.context.namespaces[key] = ns + return ns + + def get_template(self, uri): + """Return a :class:`.Template` from the given ``uri``. + + The ``uri`` resolution is relative to the ``uri`` of this + :class:`.Namespace` object's :class:`.Template`. + + """ + return _lookup_template(self.context, uri, self._templateuri) + + def get_cached(self, key, **kwargs): + """Return a value from the :class:`.Cache` referenced by this + :class:`.Namespace` object's :class:`.Template`. + + The advantage to this method versus direct access to the + :class:`.Cache` is that the configuration parameters + declared in ``<%page>`` take effect here, thereby calling + up the same configured backend as that configured + by ``<%page>``. + + """ + + return self.cache.get(key, **kwargs) + + @property + def cache(self): + """Return the :class:`.Cache` object referenced + by this :class:`.Namespace` object's + :class:`.Template`. 
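A small sketch of namespace-style access to rendering methods, here through the built-in ``local`` namespace of the template itself (documented Mako behavior; the def name is illustrative):

    from mako.template import Template

    t = Template(
        '<%def name="greet(who)">hello ${who}</%def>'
        "${local.greet('namespace')}")
    print(t.render().strip())   # hello namespace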
+ + """ + return self.template.cache + + def include_file(self, uri, **kwargs): + """Include a file at the given ``uri``.""" + + _include_file(self.context, uri, self._templateuri, **kwargs) + + def _populate(self, d, l): + for ident in l: + if ident == '*': + for (k, v) in self._get_star(): + d[k] = v + else: + d[ident] = getattr(self, ident) + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif self.inherits: + val = getattr(self.inherits, key) + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +class TemplateNamespace(Namespace): + """A :class:`.Namespace` specific to a :class:`.Template` instance.""" + + def __init__(self, name, context, template=None, templateuri=None, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + if templateuri is not None: + self.template = _lookup_template(context, templateuri, + calling_uri) + self._templateuri = self.template.module._template_uri + elif template is not None: + self.template = template + self._templateuri = template.module._template_uri + else: + raise TypeError("'template' argument is required.") + + if populate_self: + lclcallable, lclcontext = \ + _populate_self_namespace(context, self.template, + self_ns=self) + + @property + def module(self): + """The Python module referenced by this :class:`.Namespace`. + + If the namespace references a :class:`.Template`, then + this module is the equivalent of ``template.module``, + i.e. the generated module for the template. + + """ + return self.template.module + + @property + def filename(self): + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + """ + return self.template.filename + + @property + def uri(self): + """The URI for this :class:`.Namespace`'s template. + + I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. + + This is the equivalent of :attr:`.Template.uri`. 
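What a module-based namespace expects on the Python side: module-level callables that take the Context as their first argument and write through it, as ModuleNamespace wires up below. The module name here is hypothetical:

    # helpers.py
    def shout(context, text):
        context.write(text.upper())
        return ''

    # in a template:
    #   <%namespace name="h" module="helpers"/>
    #   ${h.shout('swr')}        renders SWR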
+ + """ + return self.template.uri + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + def get(key): + callable_ = self.template._get_def_callable(key) + return compat.partial(callable_, self.context) + for k in self.template.module._exports: + yield (k, get(k)) + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif self.template.has_def(key): + callable_ = self.template._get_def_callable(key) + val = compat.partial(callable_, self.context) + elif self.inherits: + val = getattr(self.inherits, key) + + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +class ModuleNamespace(Namespace): + """A :class:`.Namespace` specific to a Python module instance.""" + + def __init__(self, name, context, module, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + mod = __import__(module) + for token in module.split('.')[1:]: + mod = getattr(mod, token) + self.module = mod + + @property + def filename(self): + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + """ + return self.module.__file__ + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + for key in dir(self.module): + if key[0] != '_': + callable_ = getattr(self.module, key) + if compat.callable(callable_): + yield key, compat.partial(callable_, self.context) + + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif hasattr(self.module, key): + callable_ = getattr(self.module, key) + val = compat.partial(callable_, self.context) + elif self.inherits: + val = getattr(self.inherits, key) + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +def supports_caller(func): + """Apply a caller_stack compatibility decorator to a plain + Python function. + + See the example in :ref:`namespaces_python_modules`. + + """ + + def wrap_stackframe(context, *args, **kwargs): + context.caller_stack._push_frame() + try: + return func(context, *args, **kwargs) + finally: + context.caller_stack._pop_frame() + return wrap_stackframe + +def capture(context, callable_, *args, **kwargs): + """Execute the given template def, capturing the output into + a buffer. + + See the example in :ref:`namespaces_python_modules`. + + """ + + if not compat.callable(callable_): + raise exceptions.RuntimeException( + "capture() function expects a callable as " + "its argument (i.e. 
capture(func, *args, **kwargs))" + ) + context._push_buffer() + try: + callable_(*args, **kwargs) + finally: + buf = context._pop_buffer() + return buf.getvalue() + +def _decorate_toplevel(fn): + def decorate_render(render_fn): + def go(context, *args, **kw): + def y(*args, **kw): + return render_fn(context, *args, **kw) + try: + y.__name__ = render_fn.__name__[7:] + except TypeError: + # < Python 2.4 + pass + return fn(y)(context, *args, **kw) + return go + return decorate_render + +def _decorate_inline(context, fn): + def decorate_render(render_fn): + dec = fn(render_fn) + def go(*args, **kw): + return dec(context, *args, **kw) + return go + return decorate_render + +def _include_file(context, uri, calling_uri, **kwargs): + """locate the template from the given uri and include it in + the current output.""" + + template = _lookup_template(context, uri, calling_uri) + (callable_, ctx) = _populate_self_namespace( + context._clean_inheritance_tokens(), + template) + callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs)) + +def _inherit_from(context, uri, calling_uri): + """called by the _inherit method in template modules to set + up the inheritance chain at the start of a template's + execution.""" + + if uri is None: + return None + template = _lookup_template(context, uri, calling_uri) + self_ns = context['self'] + ih = self_ns + while ih.inherits is not None: + ih = ih.inherits + lclcontext = context._locals({'next': ih}) + ih.inherits = TemplateNamespace("self:%s" % template.uri, + lclcontext, + template=template, + populate_self=False) + context._data['parent'] = lclcontext._data['local'] = ih.inherits + callable_ = getattr(template.module, '_mako_inherit', None) + if callable_ is not None: + ret = callable_(template, lclcontext) + if ret: + return ret + + gen_ns = getattr(template.module, '_mako_generate_namespaces', None) + if gen_ns is not None: + gen_ns(context) + return (template.callable_, lclcontext) + +def _lookup_template(context, uri, relativeto): + lookup = context._with_template.lookup + if lookup is None: + raise exceptions.TemplateLookupException( + "Template '%s' has no TemplateLookup associated" % + context._with_template.uri) + uri = lookup.adjust_uri(uri, relativeto) + try: + return lookup.get_template(uri) + except exceptions.TopLevelLookupException: + raise exceptions.TemplateLookupException(str(compat.exception_as())) + +def _populate_self_namespace(context, template, self_ns=None): + if self_ns is None: + self_ns = TemplateNamespace('self:%s' % template.uri, + context, template=template, + populate_self=False) + context._data['self'] = context._data['local'] = self_ns + if hasattr(template.module, '_mako_inherit'): + ret = template.module._mako_inherit(template, context) + if ret: + return ret + return (template.callable_, context) + +def _render(template, callable_, args, data, as_unicode=False): + """create a Context and return the string + output of the given template and template callable.""" + + if as_unicode: + buf = util.FastEncodingBuffer(as_unicode=True) + elif template.bytestring_passthrough: + buf = compat.StringIO() + else: + buf = util.FastEncodingBuffer( + as_unicode=as_unicode, + encoding=template.output_encoding, + errors=template.encoding_errors) + context = Context(buf, **data) + context._outputting_as_unicode = as_unicode + context._set_with_template(template) + + _render_context(template, callable_, context, *args, + **_kwargs_for_callable(callable_, data)) + return context._pop_buffer().getvalue() + +def 
_kwargs_for_callable(callable_, data): + argspec = compat.inspect_func_args(callable_) + # for normal pages, **pageargs is usually present + if argspec[2]: + return data + + # for rendering defs from the top level, figure out the args + namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] + kwargs = {} + for arg in namedargs: + if arg != 'context' and arg in data and arg not in kwargs: + kwargs[arg] = data[arg] + return kwargs + +def _kwargs_for_include(callable_, data, **kwargs): + argspec = compat.inspect_func_args(callable_) + namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] + for arg in namedargs: + if arg != 'context' and arg in data and arg not in kwargs: + kwargs[arg] = data[arg] + return kwargs + +def _render_context(tmpl, callable_, context, *args, **kwargs): + import mako.template as template + # create polymorphic 'self' namespace for this + # template with possibly updated context + if not isinstance(tmpl, template.DefTemplate): + # if main render method, call from the base of the inheritance stack + (inherit, lclcontext) = _populate_self_namespace(context, tmpl) + _exec_template(inherit, lclcontext, args=args, kwargs=kwargs) + else: + # otherwise, call the actual rendering method specified + (inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent) + _exec_template(callable_, context, args=args, kwargs=kwargs) + +def _exec_template(callable_, context, args=None, kwargs=None): + """execute a rendering callable given the callable, a + Context, and optional explicit arguments + + the contextual Template will be located if it exists, and + the error handling options specified on that Template will + be interpreted here. + """ + template = context._with_template + if template is not None and \ + (template.format_exceptions or template.error_handler): + try: + callable_(context, *args, **kwargs) + except Exception: + _render_error(template, context, compat.exception_as()) + except: + e = sys.exc_info()[0] + _render_error(template, context, e) + else: + callable_(context, *args, **kwargs) + +def _render_error(template, context, error): + if template.error_handler: + result = template.error_handler(context, error) + if not result: + compat.reraise(*sys.exc_info()) + else: + error_template = exceptions.html_error_template() + if context._outputting_as_unicode: + context._buffer_stack[:] = [ + util.FastEncodingBuffer(as_unicode=True)] + else: + context._buffer_stack[:] = [util.FastEncodingBuffer( + error_template.output_encoding, + error_template.encoding_errors)] + + context._set_with_template(error_template) + error_template.render_context(context, error=error) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py new file mode 100644 index 00000000000..fb6106289fa --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py @@ -0,0 +1,705 @@ +# mako/template.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""Provides the Template class, a facade for parsing, generating and executing +template strings, as well as template runtime operations.""" + +from mako.lexer import Lexer +from mako import runtime, util, exceptions, codegen, cache, compat +import os +import re +import shutil +import stat +import sys +import tempfile +import types +import weakref + + +class Template(object): 
+ """Represents a compiled template. + + :class:`.Template` includes a reference to the original + template source (via the :attr:`.source` attribute) + as well as the source code of the + generated Python module (i.e. the :attr:`.code` attribute), + as well as a reference to an actual Python module. + + :class:`.Template` is constructed using either a literal string + representing the template text, or a filename representing a filesystem + path to a source file. + + :param text: textual template source. This argument is mutually + exclusive versus the ``filename`` parameter. + + :param filename: filename of the source template. This argument is + mutually exclusive versus the ``text`` parameter. + + :param buffer_filters: string list of filters to be applied + to the output of ``%def``\ s which are buffered, cached, or otherwise + filtered, after all filters + defined with the ``%def`` itself have been applied. Allows the + creation of default expression filters that let the output + of return-valued ``%def``\ s "opt out" of that filtering via + passing special attributes or objects. + + :param bytestring_passthrough: When ``True``, and ``output_encoding`` is + set to ``None``, and :meth:`.Template.render` is used to render, + the `StringIO` or `cStringIO` buffer will be used instead of the + default "fast" buffer. This allows raw bytestrings in the + output stream, such as in expressions, to pass straight + through to the buffer. This flag is forced + to ``True`` if ``disable_unicode`` is also configured. + + .. versionadded:: 0.4 + Added to provide the same behavior as that of the previous series. + + :param cache_args: Dictionary of cache configuration arguments that + will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`. + + :param cache_dir: + + .. deprecated:: 0.6 + Use the ``'dir'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param cache_enabled: Boolean flag which enables caching of this + template. See :ref:`caching_toplevel`. + + :param cache_impl: String name of a :class:`.CacheImpl` caching + implementation to use. Defaults to ``'beaker'``. + + :param cache_type: + + .. deprecated:: 0.6 + Use the ``'type'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param cache_url: + + .. deprecated:: 0.6 + Use the ``'url'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param default_filters: List of string filter names that will + be applied to all expressions. See :ref:`filtering_default_filters`. + + :param disable_unicode: Disables all awareness of Python Unicode + objects. See :ref:`unicode_disabled`. + + :param enable_loop: When ``True``, enable the ``loop`` context variable. + This can be set to ``False`` to support templates that may + be making usage of the name "``loop``". Individual templates can + re-enable the "loop" context by placing the directive + ``enable_loop="True"`` inside the ``<%page>`` tag -- see + :ref:`migrating_loop`. + + :param encoding_errors: Error parameter passed to ``encode()`` when + string encoding is performed. See :ref:`usage_unicode`. + + :param error_handler: Python callable which is called whenever + compile or runtime exceptions occur. The callable is passed + the current context as well as the exception. If the + callable returns ``True``, the exception is considered to + be handled, else it is re-raised after the function + completes. Is used to provide custom error-rendering + functions. 
+
+ :param format_exceptions: if ``True``, exceptions which occur during
+ the render phase of this template will be caught and
+ formatted into an HTML error page, which then becomes the
+ rendered result of the :meth:`.render` call. Otherwise,
+ runtime exceptions are propagated outwards.
+
+ :param imports: String list of Python statements, typically individual
+ "import" lines, which will be placed into the module level
+ preamble of all generated Python modules. See the example
+ in :ref:`filtering_default_filters`.
+
+ :param future_imports: String list of names to import from `__future__`.
+ These will be concatenated into a comma-separated string and inserted
+ into the beginning of the template, e.g. ``future_imports=['FOO',
+ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're
+ interested in using features like the new division operator, you must
+ use future_imports to convey that to the renderer, as otherwise the
+ import will not appear as the first executed statement in the generated
+ code and will therefore not have the desired effect.
+
+ :param input_encoding: Encoding of the template's source code. Can
+ be used in lieu of the coding comment. See
+ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for
+ details on source encoding.
+
+ :param lookup: a :class:`.TemplateLookup` instance that will be used
+ for all file lookups via the ``<%namespace>``,
+ ``<%include>``, and ``<%inherit>`` tags. See
+ :ref:`usage_templatelookup`.
+
+ :param module_directory: Filesystem location where generated
+ Python module files will be placed.
+
+ :param module_filename: Overrides the filename of the generated
+ Python module file. For advanced usage only.
+
+ :param module_writer: A callable which overrides how the Python
+ module is written entirely. The callable is passed the
+ encoded source content of the module and the destination
+ path to be written to. The default behavior of module writing
+ uses a tempfile in conjunction with a file move in order
+ to make the operation atomic. So a user-defined module
+ writing function that mimics the default behavior would be:
+
+ .. sourcecode:: python
+
+ import tempfile
+ import os
+ import shutil
+
+ def module_writer(source, outputpath):
+ (dest, name) = \\
+ tempfile.mkstemp(
+ dir=os.path.dirname(outputpath)
+ )
+
+ os.write(dest, source)
+ os.close(dest)
+ shutil.move(name, outputpath)
+
+ from mako.template import Template
+ mytemplate = Template(
+ filename="index.html",
+ module_directory="/path/to/modules",
+ module_writer=module_writer
+ )
+
+ The function is provided for unusual configurations where
+ certain platform-specific permissions or other special
+ steps are needed.
+
+ :param output_encoding: The encoding to use when :meth:`.render`
+ is called.
+ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`.
+
+ :param preprocessor: Python callable which will be passed
+ the full template source before it is parsed. The return
+ result of the callable will be used as the template source
+ code.
+
+ :param lexer_cls: A :class:`.Lexer` class used to parse
+ the template. The :class:`.Lexer` class is used by
+ default.
+
+ .. versionadded:: 0.7.4
+
+ :param strict_undefined: Replaces the automatic usage of
+ ``UNDEFINED`` for any undeclared variables not located in
+ the :class:`.Context` with an immediate raise of
+ ``NameError``. The advantage is immediate reporting of
+ missing variables which include the name.
+
+ ..
versionadded:: 0.3.6 + + :param uri: string URI or other identifier for this template. + If not provided, the ``uri`` is generated from the filesystem + path, or from the in-memory identity of a non-file-based + template. The primary usage of the ``uri`` is to provide a key + within :class:`.TemplateLookup`, as well as to generate the + file path of the generated Python module file, if + ``module_directory`` is specified. + + """ + + lexer_cls = Lexer + + def __init__(self, + text=None, + filename=None, + uri=None, + format_exceptions=False, + error_handler=None, + lookup=None, + output_encoding=None, + encoding_errors='strict', + module_directory=None, + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + module_filename=None, + input_encoding=None, + disable_unicode=False, + module_writer=None, + bytestring_passthrough=False, + default_filters=None, + buffer_filters=(), + strict_undefined=False, + imports=None, + future_imports=None, + enable_loop=True, + preprocessor=None, + lexer_cls=None): + if uri: + self.module_id = re.sub(r'\W', "_", uri) + self.uri = uri + elif filename: + self.module_id = re.sub(r'\W', "_", filename) + drive, path = os.path.splitdrive(filename) + path = os.path.normpath(path).replace(os.path.sep, "/") + self.uri = path + else: + self.module_id = "memory:" + hex(id(self)) + self.uri = self.module_id + + u_norm = self.uri + if u_norm.startswith("/"): + u_norm = u_norm[1:] + u_norm = os.path.normpath(u_norm) + if u_norm.startswith(".."): + raise exceptions.TemplateLookupException( + "Template uri \"%s\" is invalid - " + "it cannot be relative outside " + "of the root path." % self.uri) + + self.input_encoding = input_encoding + self.output_encoding = output_encoding + self.encoding_errors = encoding_errors + self.disable_unicode = disable_unicode + self.bytestring_passthrough = bytestring_passthrough or disable_unicode + self.enable_loop = enable_loop + self.strict_undefined = strict_undefined + self.module_writer = module_writer + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + elif output_encoding and disable_unicode: + raise exceptions.UnsupportedError( + "output_encoding must be set to " + "None when disable_unicode is used.") + if default_filters is None: + if compat.py3k or self.disable_unicode: + self.default_filters = ['str'] + else: + self.default_filters = ['unicode'] + else: + self.default_filters = default_filters + self.buffer_filters = buffer_filters + + self.imports = imports + self.future_imports = future_imports + self.preprocessor = preprocessor + + if lexer_cls is not None: + self.lexer_cls = lexer_cls + + # if plain text, compile code in memory only + if text is not None: + (code, module) = _compile_text(self, text, filename) + self._code = code + self._source = text + ModuleInfo(module, None, self, filename, code, text) + elif filename is not None: + # if template filename and a module directory, load + # a filesystem-based module file, generating if needed + if module_filename is not None: + path = module_filename + elif module_directory is not None: + path = os.path.abspath( + os.path.join( + os.path.normpath(module_directory), + u_norm + ".py" + ) + ) + else: + path = None + module = self._compile_from_file(path, filename) + else: + raise exceptions.RuntimeException( + "Template requires text or filename") + + self.module = module + self.filename = filename + self.callable_ = 
self.module.render_body + self.format_exceptions = format_exceptions + self.error_handler = error_handler + self.lookup = lookup + + self.module_directory = module_directory + + self._setup_cache_args( + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url + ) + + + @util.memoized_property + def reserved_names(self): + if self.enable_loop: + return codegen.RESERVED_NAMES + else: + return codegen.RESERVED_NAMES.difference(['loop']) + + def _setup_cache_args(self, + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url): + self.cache_impl = cache_impl + self.cache_enabled = cache_enabled + if cache_args: + self.cache_args = cache_args + else: + self.cache_args = {} + + # transfer deprecated cache_* args + if cache_type: + self.cache_args['type'] = cache_type + if cache_dir: + self.cache_args['dir'] = cache_dir + if cache_url: + self.cache_args['url'] = cache_url + + def _compile_from_file(self, path, filename): + if path is not None: + util.verify_directory(os.path.dirname(path)) + filemtime = os.stat(filename)[stat.ST_MTIME] + if not os.path.exists(path) or \ + os.stat(path)[stat.ST_MTIME] < filemtime: + data = util.read_file(filename) + _compile_module_file( + self, + data, + filename, + path, + self.module_writer) + module = compat.load_module(self.module_id, path) + del sys.modules[self.module_id] + if module._magic_number != codegen.MAGIC_NUMBER: + data = util.read_file(filename) + _compile_module_file( + self, + data, + filename, + path, + self.module_writer) + module = compat.load_module(self.module_id, path) + del sys.modules[self.module_id] + ModuleInfo(module, path, self, filename, None, None) + else: + # template filename and no module directory, compile code + # in memory + data = util.read_file(filename) + code, module = _compile_text( + self, + data, + filename) + self._source = None + self._code = code + ModuleInfo(module, None, self, filename, code, None) + return module + + @property + def source(self): + """Return the template source code for this :class:`.Template`.""" + + return _get_module_info_from_callable(self.callable_).source + + @property + def code(self): + """Return the module source code for this :class:`.Template`.""" + + return _get_module_info_from_callable(self.callable_).code + + @util.memoized_property + def cache(self): + return cache.Cache(self) + + @property + def cache_dir(self): + return self.cache_args['dir'] + @property + def cache_url(self): + return self.cache_args['url'] + @property + def cache_type(self): + return self.cache_args['type'] + + def render(self, *args, **data): + """Render the output of this template as a string. + + If the template specifies an output encoding, the string + will be encoded accordingly, else the output is raw (raw + output uses `cStringIO` and can't handle multibyte + characters). A :class:`.Context` object is created corresponding + to the given data. Arguments that are explicitly declared + by this template's internal rendering method are also + pulled from the given ``*args``, ``**data`` members. + + """ + return runtime._render(self, self.callable_, args, data) + + def render_unicode(self, *args, **data): + """Render the output of this template as a unicode object.""" + + return runtime._render(self, + self.callable_, + args, + data, + as_unicode=True) + + def render_context(self, context, *args, **kwargs): + """Render this :class:`.Template` with the given context. + + The data is written to the context's buffer. 
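+
+        A minimal usage sketch (``Context(buffer, **data)`` is the public
+        constructor from :mod:`mako.runtime`; the ``StringIO`` buffer is
+        only an illustration):
+
+        .. sourcecode:: python
+
+            from io import StringIO
+            from mako.runtime import Context
+            from mako.template import Template
+
+            template = Template("hello, ${name}!")
+            buf = StringIO()
+            template.render_context(Context(buf, name="world"))
+            print(buf.getvalue())   # -> hello, world!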
+ + """ + if getattr(context, '_with_template', None) is None: + context._set_with_template(self) + runtime._render_context(self, + self.callable_, + context, + *args, + **kwargs) + + def has_def(self, name): + return hasattr(self.module, "render_%s" % name) + + def get_def(self, name): + """Return a def of this template as a :class:`.DefTemplate`.""" + + return DefTemplate(self, getattr(self.module, "render_%s" % name)) + + def _get_def_callable(self, name): + return getattr(self.module, "render_%s" % name) + + @property + def last_modified(self): + return self.module._modified_time + +class ModuleTemplate(Template): + """A Template which is constructed given an existing Python module. + + e.g.:: + + t = Template("this is a template") + f = file("mymodule.py", "w") + f.write(t.code) + f.close() + + import mymodule + + t = ModuleTemplate(mymodule) + print t.render() + + """ + + def __init__(self, module, + module_filename=None, + template=None, + template_filename=None, + module_source=None, + template_source=None, + output_encoding=None, + encoding_errors='strict', + disable_unicode=False, + bytestring_passthrough=False, + format_exceptions=False, + error_handler=None, + lookup=None, + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + ): + self.module_id = re.sub(r'\W', "_", module._template_uri) + self.uri = module._template_uri + self.input_encoding = module._source_encoding + self.output_encoding = output_encoding + self.encoding_errors = encoding_errors + self.disable_unicode = disable_unicode + self.bytestring_passthrough = bytestring_passthrough or disable_unicode + self.enable_loop = module._enable_loop + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + elif output_encoding and disable_unicode: + raise exceptions.UnsupportedError( + "output_encoding must be set to " + "None when disable_unicode is used.") + + self.module = module + self.filename = template_filename + ModuleInfo(module, + module_filename, + self, + template_filename, + module_source, + template_source) + + self.callable_ = self.module.render_body + self.format_exceptions = format_exceptions + self.error_handler = error_handler + self.lookup = lookup + self._setup_cache_args( + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url + ) + +class DefTemplate(Template): + """A :class:`.Template` which represents a callable def in a parent + template.""" + + def __init__(self, parent, callable_): + self.parent = parent + self.callable_ = callable_ + self.output_encoding = parent.output_encoding + self.module = parent.module + self.encoding_errors = parent.encoding_errors + self.format_exceptions = parent.format_exceptions + self.error_handler = parent.error_handler + self.enable_loop = parent.enable_loop + self.lookup = parent.lookup + self.bytestring_passthrough = parent.bytestring_passthrough + + def get_def(self, name): + return self.parent.get_def(name) + +class ModuleInfo(object): + """Stores information about a module currently loaded into + memory, provides reverse lookups of template source, module + source code based on a module's identifier. 
+ + """ + _modules = weakref.WeakValueDictionary() + + def __init__(self, + module, + module_filename, + template, + template_filename, + module_source, + template_source): + self.module = module + self.module_filename = module_filename + self.template_filename = template_filename + self.module_source = module_source + self.template_source = template_source + self._modules[module.__name__] = template._mmarker = self + if module_filename: + self._modules[module_filename] = self + + @classmethod + def get_module_source_metadata(cls, module_source, full_line_map=False): + source_map = re.search( + r"__M_BEGIN_METADATA(.+?)__M_END_METADATA", + module_source, re.S).group(1) + source_map = compat.json.loads(source_map) + source_map['line_map'] = dict((int(k), int(v)) + for k, v in source_map['line_map'].items()) + if full_line_map: + f_line_map = source_map['full_line_map'] = [] + line_map = source_map['line_map'] + + curr_templ_line = 1 + for mod_line in range(1, max(line_map)): + if mod_line in line_map: + curr_templ_line = line_map[mod_line] + f_line_map.append(curr_templ_line) + return source_map + + @property + def code(self): + if self.module_source is not None: + return self.module_source + else: + return util.read_python_file(self.module_filename) + + @property + def source(self): + if self.template_source is not None: + if self.module._source_encoding and \ + not isinstance(self.template_source, compat.text_type): + return self.template_source.decode( + self.module._source_encoding) + else: + return self.template_source + else: + data = util.read_file(self.template_filename) + if self.module._source_encoding: + return data.decode(self.module._source_encoding) + else: + return data + +def _compile(template, text, filename, generate_magic_comment): + lexer = template.lexer_cls(text, + filename, + disable_unicode=template.disable_unicode, + input_encoding=template.input_encoding, + preprocessor=template.preprocessor) + node = lexer.parse() + source = codegen.compile(node, + template.uri, + filename, + default_filters=template.default_filters, + buffer_filters=template.buffer_filters, + imports=template.imports, + future_imports=template.future_imports, + source_encoding=lexer.encoding, + generate_magic_comment=generate_magic_comment, + disable_unicode=template.disable_unicode, + strict_undefined=template.strict_undefined, + enable_loop=template.enable_loop, + reserved_names=template.reserved_names) + return source, lexer + +def _compile_text(template, text, filename): + identifier = template.module_id + source, lexer = _compile(template, text, filename, + generate_magic_comment=template.disable_unicode) + + cid = identifier + if not compat.py3k and isinstance(cid, compat.text_type): + cid = cid.encode() + module = types.ModuleType(cid) + code = compile(source, cid, 'exec') + + # this exec() works for 2.4->3.3. + exec(code, module.__dict__, module.__dict__) + return (source, module) + +def _compile_module_file(template, text, filename, outputpath, module_writer): + source, lexer = _compile(template, text, filename, + generate_magic_comment=True) + + if isinstance(source, compat.text_type): + source = source.encode(lexer.encoding or 'ascii') + + if module_writer: + module_writer(source, outputpath) + else: + # make tempfiles in the same location as the ultimate + # location. this ensures they're on the same filesystem, + # avoiding synchronization issues. 
+ (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath)) + + os.write(dest, source) + os.close(dest) + shutil.move(name, outputpath) + +def _get_module_info_from_callable(callable_): + if compat.py3k: + return _get_module_info(callable_.__globals__['__name__']) + else: + return _get_module_info(callable_.func_globals['__name__']) + +def _get_module_info(filename): + return ModuleInfo._modules[filename] + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py new file mode 100644 index 00000000000..cba2ab7920c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py @@ -0,0 +1,360 @@ +# mako/util.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +import re +import collections +import codecs +import os +from mako import compat +import operator + +def update_wrapper(decorated, fn): + decorated.__wrapped__ = fn + decorated.__name__ = fn.__name__ + return decorated + + +class PluginLoader(object): + def __init__(self, group): + self.group = group + self.impls = {} + + def load(self, name): + if name in self.impls: + return self.impls[name]() + else: + import pkg_resources + for impl in pkg_resources.iter_entry_points( + self.group, + name): + self.impls[name] = impl.load + return impl.load() + else: + from mako import exceptions + raise exceptions.RuntimeException( + "Can't load plugin %s %s" % + (self.group, name)) + + def register(self, name, modulepath, objname): + def load(): + mod = __import__(modulepath) + for token in modulepath.split(".")[1:]: + mod = getattr(mod, token) + return getattr(mod, objname) + self.impls[name] = load + +def verify_directory(dir): + """create and/or verify a filesystem directory.""" + + tries = 0 + + while not os.path.exists(dir): + try: + tries += 1 + os.makedirs(dir, compat.octal("0775")) + except: + if tries > 5: + raise + +def to_list(x, default=None): + if x is None: + return default + if not isinstance(x, (list, tuple)): + return [x] + else: + return x + + +class memoized_property(object): + """A read-only @property that is only evaluated once.""" + def __init__(self, fget, doc=None): + self.fget = fget + self.__doc__ = doc or fget.__doc__ + self.__name__ = fget.__name__ + + def __get__(self, obj, cls): + if obj is None: + return self + obj.__dict__[self.__name__] = result = self.fget(obj) + return result + +class memoized_instancemethod(object): + """Decorate a method to memoize its return value. + + Best applied to no-arg methods: memoization is not sensitive to + argument values, and will always return the same value even when + called with different arguments. + + """ + def __init__(self, fget, doc=None): + self.fget = fget + self.__doc__ = doc or fget.__doc__ + self.__name__ = fget.__name__ + + def __get__(self, obj, cls): + if obj is None: + return self + def oneshot(*args, **kw): + result = self.fget(obj, *args, **kw) + memo = lambda *a, **kw: result + memo.__name__ = self.__name__ + memo.__doc__ = self.__doc__ + obj.__dict__[self.__name__] = memo + return result + oneshot.__name__ = self.__name__ + oneshot.__doc__ = self.__doc__ + return oneshot + +class SetLikeDict(dict): + """a dictionary that has some setlike methods on it""" + def union(self, other): + """produce a 'union' of this dict and another (at the key level).
+ + values in the second dict take precedence over that of the first""" + x = SetLikeDict(**self) + x.update(other) + return x + +class FastEncodingBuffer(object): + """a very rudimentary buffer that is faster than StringIO, + but doesn't crash on unicode data like cStringIO.""" + + def __init__(self, encoding=None, errors='strict', as_unicode=False): + self.data = collections.deque() + self.encoding = encoding + if as_unicode: + self.delim = compat.u('') + else: + self.delim = '' + self.as_unicode = as_unicode + self.errors = errors + self.write = self.data.append + + def truncate(self): + self.data = collections.deque() + self.write = self.data.append + + def getvalue(self): + if self.encoding: + return self.delim.join(self.data).encode(self.encoding, + self.errors) + else: + return self.delim.join(self.data) + +class LRUCache(dict): + """A dictionary-like object that stores a limited number of items, + discarding lesser used items periodically. + + this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based + paradigm so that synchronization is not really needed. the size management + is inexact. + """ + + class _Item(object): + def __init__(self, key, value): + self.key = key + self.value = value + self.timestamp = compat.time_func() + def __repr__(self): + return repr(self.value) + + def __init__(self, capacity, threshold=.5): + self.capacity = capacity + self.threshold = threshold + + def __getitem__(self, key): + item = dict.__getitem__(self, key) + item.timestamp = compat.time_func() + return item.value + + def values(self): + return [i.value for i in dict.values(self)] + + def setdefault(self, key, value): + if key in self: + return self[key] + else: + self[key] = value + return value + + def __setitem__(self, key, value): + item = dict.get(self, key) + if item is None: + item = self._Item(key, value) + dict.__setitem__(self, key, item) + else: + item.value = value + self._manage_size() + + def _manage_size(self): + while len(self) > self.capacity + self.capacity * self.threshold: + bytime = sorted(dict.values(self), + key=operator.attrgetter('timestamp'), reverse=True) + for item in bytime[self.capacity:]: + try: + del self[item.key] + except KeyError: + # if we couldn't find a key, most likely some other thread + # broke in on us. loop around and try again + break + +# Regexp to match python magic encoding line +_PYTHON_MAGIC_COMMENT_re = re.compile( + r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', + re.VERBOSE) + +def parse_encoding(fp): + """Deduce the encoding of a Python source file (binary mode) from magic + comment. + + It does this in the same way as the `Python interpreter`__ + + .. __: http://docs.python.org/ref/encodings.html + + The ``fp`` argument should be a seekable file object in binary mode. + """ + pos = fp.tell() + fp.seek(0) + try: + line1 = fp.readline() + has_bom = line1.startswith(codecs.BOM_UTF8) + if has_bom: + line1 = line1[len(codecs.BOM_UTF8):] + + m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore')) + if not m: + try: + import parser + parser.suite(line1.decode('ascii', 'ignore')) + except (ImportError, SyntaxError): + # Either it's a real syntax error, in which case the source + # is not valid python source, or line2 is a continuation of + # line1, in which case we don't want to scan line2 for a magic + # comment. 
+ pass + else: + line2 = fp.readline() + m = _PYTHON_MAGIC_COMMENT_re.match( + line2.decode('ascii', 'ignore')) + + if has_bom: + if m: + raise SyntaxError("python refuses to compile code with both a UTF8" \ + " byte-order-mark and a magic encoding comment") + return 'utf_8' + elif m: + return m.group(1) + else: + return None + finally: + fp.seek(pos) + +def sorted_dict_repr(d): + """repr() a dictionary with the keys in order. + + Used by the lexer unit test to compare parse trees based on strings. + + """ + keys = list(d.keys()) + keys.sort() + return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}" + +def restore__ast(_ast): + """Attempt to restore the required classes to the _ast module if it + appears to be missing them + """ + if hasattr(_ast, 'AST'): + return + _ast.PyCF_ONLY_AST = 2 << 9 + m = compile("""\ +def foo(): pass +class Bar(object): pass +if False: pass +baz = 'mako' +1 + 2 - 3 * 4 / 5 +6 // 7 % 8 << 9 >> 10 +11 & 12 ^ 13 | 14 +15 and 16 or 17 +-baz + (not +18) - ~17 +baz and 'foo' or 'bar' +(mako is baz == baz) is not baz != mako +mako > baz < mako >= baz <= mako +mako in baz not in mako""", '<unknown>', 'exec', _ast.PyCF_ONLY_AST) + _ast.Module = type(m) + + for cls in _ast.Module.__mro__: + if cls.__name__ == 'mod': + _ast.mod = cls + elif cls.__name__ == 'AST': + _ast.AST = cls + + _ast.FunctionDef = type(m.body[0]) + _ast.ClassDef = type(m.body[1]) + _ast.If = type(m.body[2]) + + _ast.Name = type(m.body[3].targets[0]) + _ast.Store = type(m.body[3].targets[0].ctx) + _ast.Str = type(m.body[3].value) + + _ast.Sub = type(m.body[4].value.op) + _ast.Add = type(m.body[4].value.left.op) + _ast.Div = type(m.body[4].value.right.op) + _ast.Mult = type(m.body[4].value.right.left.op) + + _ast.RShift = type(m.body[5].value.op) + _ast.LShift = type(m.body[5].value.left.op) + _ast.Mod = type(m.body[5].value.left.left.op) + _ast.FloorDiv = type(m.body[5].value.left.left.left.op) + + _ast.BitOr = type(m.body[6].value.op) + _ast.BitXor = type(m.body[6].value.left.op) + _ast.BitAnd = type(m.body[6].value.left.left.op) + + _ast.Or = type(m.body[7].value.op) + _ast.And = type(m.body[7].value.values[0].op) + + _ast.Invert = type(m.body[8].value.right.op) + _ast.Not = type(m.body[8].value.left.right.op) + _ast.UAdd = type(m.body[8].value.left.right.operand.op) + _ast.USub = type(m.body[8].value.left.left.op) + + _ast.Or = type(m.body[9].value.op) + _ast.And = type(m.body[9].value.values[0].op) + + _ast.IsNot = type(m.body[10].value.ops[0]) + _ast.NotEq = type(m.body[10].value.ops[1]) + _ast.Is = type(m.body[10].value.left.ops[0]) + _ast.Eq = type(m.body[10].value.left.ops[1]) + + _ast.Gt = type(m.body[11].value.ops[0]) + _ast.Lt = type(m.body[11].value.ops[1]) + _ast.GtE = type(m.body[11].value.ops[2]) + _ast.LtE = type(m.body[11].value.ops[3]) + + _ast.In = type(m.body[12].value.ops[0]) + _ast.NotIn = type(m.body[12].value.ops[1]) + + + +def read_file(path, mode='rb'): + fp = open(path, mode) + try: + data = fp.read() + return data + finally: + fp.close() + +def read_python_file(path): + fp = open(path, "rb") + try: + encoding = parse_encoding(fp) + data = fp.read() + if encoding: + data = data.decode(encoding) + return data + finally: + fp.close() + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template new file mode 100644 index 00000000000..922117e7e16 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template @@ -0,0 +1,141 @@ +<% + max_len = 0 + for 
knob in knobs: + if len(knob[0]) > max_len: max_len = len(knob[0]) + max_len += len('KNOB_ ') + if max_len % 4: max_len += 4 - (max_len % 4) + + def space_knob(knob): + knob_len = len('KNOB_' + knob) + return ' '*(max_len - knob_len) +%>/****************************************************************************** +* +* Copyright 2015 +* Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http ://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +% if gen_header: +* @file ${filename}.h +% else: +* @file ${filename}.cpp +% endif +* +* @brief Dynamic Knobs for Core. +* +* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== +* +******************************************************************************/ +%if gen_header: +#pragma once +#include <string> + +template <typename T> +struct Knob +{ + const T& Value() const { return m_Value; } + const T& Value(const T& newValue) { m_Value = newValue; return Value(); } + +protected: + Knob(const T& defaultValue) : m_Value(defaultValue) {} + +private: + T m_Value; +}; + +#define DEFINE_KNOB(_name, _type, _default) \\ + + struct Knob_##_name : Knob<_type> \\ + + { \\ + + Knob_##_name() : Knob<_type>(_default) { } \\ + + static const char* Name() { return "KNOB_" #_name; } \\ + + } _name; + +#define GET_KNOB(_name) g_GlobalKnobs._name.Value() +#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) + +struct GlobalKnobs +{ + % for knob in knobs: + //----------------------------------------------------------- + // KNOB_${knob[0]} + // + % for line in knob[1]['desc']: + // ${line} + % endfor + DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); + + % endfor + GlobalKnobs(); + std::string ToString(const char* optPerLinePrefix=""); +}; +extern GlobalKnobs g_GlobalKnobs; + +% for knob in knobs: +#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]}) +% endfor + + +% else: +% for inc in includes: +#include <${inc}> +% endfor + +//======================================================== +// Static Data Members +//======================================================== +GlobalKnobs g_GlobalKnobs; + +//======================================================== +// Knob Initialization +//======================================================== +GlobalKnobs::GlobalKnobs() +{ + % for knob in knobs: + InitKnob(${knob[0]}); + % endfor + +} + +//======================================================== +// Knob Display (Convert to String) +//======================================================== +std::string GlobalKnobs::ToString(const char* optPerLinePrefix) +{ + std::basic_stringstream<char> str; + str << std::showbase << std::setprecision(1) << std::fixed; + + if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; } + + % for knob in knobs: + str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}"; + % if knob[1]['type'] == 'bool': + str << (KNOB_${knob[0]} ? 
"+\n" : "-\n"); + % elif knob[1]['type'] != 'float': + str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]}; + str << std::dec << KNOB_${knob[0]} << "\n"; + % else: + str << KNOB_${knob[0]} << "\n"; + % endif + % endfor + str << std::ends; + + return str.str(); +} + + +% endif diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp new file mode 100644 index 00000000000..103bca99441 --- /dev/null +++ b/src/gallium/drivers/swr/swr_clear.cpp @@ -0,0 +1,142 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "swr_context.h" +#include "swr_query.h" + +static void +swr_clear(struct pipe_context *pipe, + unsigned buffers, + const union pipe_color_union *color, + double depth, + unsigned stencil) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + UINT clearMask = 0; + + if (!swr_check_render_cond(pipe)) + return; + + if (ctx->dirty) + swr_update_derived(pipe); + +/* Update clearMask/targetMask */ +#if 0 /* XXX SWR currently only clears SWR_ATTACHMENT_COLOR0, don't bother \ + checking others yet. */ + if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { + UINT i; + for (i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) + clearMask |= (SWR_CLEAR_COLOR0 << i); + } +#else + if (buffers & PIPE_CLEAR_COLOR && fb->cbufs[0]) + clearMask |= SWR_CLEAR_COLOR; +#endif + + if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf) + clearMask |= SWR_CLEAR_DEPTH; + + if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf) + clearMask |= SWR_CLEAR_STENCIL; + +#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are + // transparent. + ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */ +#endif + + /* Reset viewport to full framebuffer width/height before clear, then + * restore it */ + /* Scissor affects clear, viewport should not */ + ctx->dirty |= SWR_NEW_VIEWPORT; + SWR_VIEWPORT vp = {0}; + vp.width = ctx->framebuffer.width; + vp.height = ctx->framebuffer.height; + SwrSetViewports(ctx->swrContext, 1, &vp, NULL); + + swr_update_draw_context(ctx); + SwrClearRenderTarget(ctx->swrContext, clearMask, color->f, depth, stencil); +} + + +#if 0 // XXX, these don't get called. how to get these called? 
Do we need + // them? Docs? +static void +swr_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps, + const union pipe_color_union *color, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct swr_context *ctx = swr_context(pipe); + fprintf(stderr, "SWR swr_clear_render_target!\n"); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} + +static void +swr_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned buffers, double depth, unsigned stencil, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct swr_context *ctx = swr_context(pipe); + fprintf(stderr, "SWR swr_clear_depth_stencil!\n"); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} + +static void +swr_clear_buffer(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) +{ + fprintf(stderr, "SWR swr_clear_buffer!\n"); + struct swr_context *ctx = swr_context(pipe); + struct swr_resource *buf = swr_resource(res); + union pipe_color_union color; + enum pipe_format dst_fmt; + unsigned width, height, elements; + + assert(res->target == PIPE_BUFFER); + assert(buf); + assert(size % data_size == 0); + + SWR_SURFACE_STATE &swr_buffer = buf->swr; + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} +#endif + + +void +swr_clear_init(struct pipe_context *pipe) +{ + pipe->clear = swr_clear; +#if 0 // XXX, these don't get called. how to get these called? Do we need + // them? Docs? + pipe->clear_render_target = swr_clear_render_target; + pipe->clear_depth_stencil = swr_clear_depth_stencil; + pipe->clear_buffer = swr_clear_buffer; +#endif +} diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp new file mode 100644 index 00000000000..c8cb145d334 --- /dev/null +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -0,0 +1,382 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_format.h" + +extern "C" { +#include "util/u_transfer.h" +#include "util/u_surface.h" +} + +#include "swr_context.h" +#include "swr_memory.h" +#include "swr_screen.h" +#include "swr_resource.h" +#include "swr_scratch.h" +#include "swr_query.h" +#include "swr_fence.h" + +#include "api.h" +#include "backend.h" + +static struct pipe_surface * +swr_create_surface(struct pipe_context *pipe, + struct pipe_resource *pt, + const struct pipe_surface *surf_tmpl) +{ + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (ps) { + pipe_reference_init(&ps->reference, 1); + pipe_resource_reference(&ps->texture, pt); + ps->context = pipe; + ps->format = surf_tmpl->format; + if (pt->target != PIPE_BUFFER) { + assert(surf_tmpl->u.tex.level <= pt->last_level); + ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); + ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); + ps->u.tex.level = surf_tmpl->u.tex.level; + ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; + ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; + if (ps->u.tex.first_layer != ps->u.tex.last_layer) { + debug_printf("creating surface with multiple layers, rendering " + "to first layer only\n"); + } + } else { + /* setting width as number of elements should get us correct + * renderbuffer width */ + ps->width = surf_tmpl->u.buf.last_element + - surf_tmpl->u.buf.first_element + 1; + ps->height = pt->height0; + ps->u.buf.first_element = surf_tmpl->u.buf.first_element; + ps->u.buf.last_element = surf_tmpl->u.buf.last_element; + assert(ps->u.buf.first_element <= ps->u.buf.last_element); + assert(ps->u.buf.last_element < ps->width); + } + } + return ps; +} + +static void +swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) +{ + assert(surf->texture); + struct pipe_resource *resource = surf->texture; + + /* If the resource has been drawn to, store tiles. */ + swr_store_dirty_resource(pipe, resource, SWR_TILE_RESOLVED); + + pipe_resource_reference(&resource, NULL); + FREE(surf); +} + + +static void * +swr_transfer_map(struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **transfer) +{ + struct swr_screen *screen = swr_screen(pipe->screen); + struct swr_resource *spr = swr_resource(resource); + struct pipe_transfer *pt; + enum pipe_format format = resource->format; + + assert(resource); + assert(level <= resource->last_level); + + /* If mapping an attached rendertarget, store tiles to surface and set + * postStoreTileState to SWR_TILE_INVALID so tiles get reloaded on next use + * and nothing needs to be done at unmap. */ + swr_store_dirty_resource(pipe, resource, SWR_TILE_INVALID); + + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { + /* If resource is in use, finish fence before mapping. + * Unless requested not to block, then if not done return NULL map */ + if (usage & PIPE_TRANSFER_DONTBLOCK) { + if (swr_is_fence_pending(screen->flush_fence)) + return NULL; + } else { + if (spr->status) { + /* But, if there's no fence pending, submit one. + * XXX: Remove once draw timestamps are finished. 
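+              * (The resource is flagged busy, but with no fence
+              * pending nothing would ever signal completion; the
+              * extra sync gives the finish below something to wait
+              * on.)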
*/ + if (!swr_is_fence_pending(screen->flush_fence)) + swr_fence_submit(swr_context(pipe), screen->flush_fence); + + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + } + } + } + + pt = CALLOC_STRUCT(pipe_transfer); + if (!pt) + return NULL; + pipe_resource_reference(&pt->resource, resource); + pt->level = level; + pt->box = *box; + pt->stride = spr->row_stride[level]; + pt->layer_stride = spr->img_stride[level]; + + /* if we're mapping the depth/stencil, copy in stencil */ + if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT + && spr->has_stencil) { + for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { + spr->swr.pBaseAddress[4 * i + 3] = spr->secondary.pBaseAddress[i]; + } + } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT + && spr->has_stencil) { + for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { + spr->swr.pBaseAddress[8 * i + 4] = spr->secondary.pBaseAddress[i]; + } + } + + unsigned offset = box->z * pt->layer_stride + box->y * pt->stride + + box->x * util_format_get_blocksize(format); + + *transfer = pt; + + return spr->swr.pBaseAddress + offset + spr->mip_offsets[level]; +} + +static void +swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) +{ + assert(transfer->resource); + + struct swr_resource *res = swr_resource(transfer->resource); + /* if we're mapping the depth/stencil, copy out stencil */ + if (res->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT + && res->has_stencil) { + for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { + res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[4 * i + 3]; + } + } else if (res->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT + && res->has_stencil) { + for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { + res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[8 * i + 4]; + } + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); +} + + +static void +swr_resource_copy(struct pipe_context *pipe, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, + unsigned dsty, + unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct swr_screen *screen = swr_screen(pipe->screen); + + /* If either the src or dst is a renderTarget, store tiles before copy */ + swr_store_dirty_resource(pipe, src, SWR_TILE_RESOLVED); + swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED); + + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + swr_resource_unused(pipe, swr_resource(src)); + swr_resource_unused(pipe, swr_resource(dst)); + + if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) + || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { + util_resource_copy_region( + pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); + return; + } + + debug_printf("unhandled swr_resource_copy\n"); +} + + +static void +swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_blit_info info = *blit_info; + + if (blit_info->render_condition_enable && !swr_check_render_cond(pipe)) + return; + + if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1 + && !util_format_is_depth_or_stencil(info.src.resource->format) + && !util_format_is_pure_integer(info.src.resource->format)) { + debug_printf("swr: color resolve unimplemented\n"); + return; + } + + if (util_try_blit_via_copy_region(pipe, 
&info)) { + return; /* done */ + } + + if (info.mask & PIPE_MASK_S) { + debug_printf("swr: cannot blit stencil, skipping\n"); + info.mask &= ~PIPE_MASK_S; + } + + if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { + debug_printf("swr: blit unsupported %s -> %s\n", + util_format_short_name(info.src.resource->format), + util_format_short_name(info.dst.resource->format)); + return; + } + + /* XXX turn off occlusion and streamout queries */ + + util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer); + util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); + util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); + /*util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs);*/ + util_blitter_save_so_targets( + ctx->blitter, + ctx->num_so_targets, + (struct pipe_stream_output_target **)ctx->so_targets); + util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer); + util_blitter_save_viewport(ctx->blitter, &ctx->viewport); + util_blitter_save_scissor(ctx->blitter, &ctx->scissor); + util_blitter_save_fragment_shader(ctx->blitter, ctx->fs); + util_blitter_save_blend(ctx->blitter, (void *)ctx->blend); + util_blitter_save_depth_stencil_alpha(ctx->blitter, + (void *)ctx->depth_stencil); + util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); + util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); + util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); + util_blitter_save_fragment_sampler_states( + ctx->blitter, + ctx->num_samplers[PIPE_SHADER_FRAGMENT], + (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]); + util_blitter_save_fragment_sampler_views( + ctx->blitter, + ctx->num_sampler_views[PIPE_SHADER_FRAGMENT], + ctx->sampler_views[PIPE_SHADER_FRAGMENT]); + util_blitter_save_render_condition(ctx->blitter, + ctx->render_cond_query, + ctx->render_cond_cond, + ctx->render_cond_mode); + + util_blitter_blit(ctx->blitter, &info); +} + + +static void +swr_destroy(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->blitter) + util_blitter_destroy(ctx->blitter); + + /* Idle core before deleting context */ + SwrWaitForIdle(ctx->swrContext); + if (ctx->swrContext) + SwrDestroyContext(ctx->swrContext); + + delete ctx->blendJIT; + + swr_destroy_scratch_buffers(ctx); + + FREE(ctx); +} + + +static void +swr_render_condition(struct pipe_context *pipe, + struct pipe_query *query, + boolean condition, + uint mode) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->render_cond_query = query; + ctx->render_cond_mode = mode; + ctx->render_cond_cond = condition; +} + +struct pipe_context * +swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags) +{ + struct swr_context *ctx = CALLOC_STRUCT(swr_context); + ctx->blendJIT = + new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>; + + SWR_CREATECONTEXT_INFO createInfo; + createInfo.driver = GL; + createInfo.privateStateSize = sizeof(swr_draw_context); + createInfo.maxSubContexts = 0; + createInfo.pfnLoadTile = swr_LoadHotTile; + createInfo.pfnStoreTile = swr_StoreHotTile; + createInfo.pfnClearTile = swr_StoreHotTileClear; + ctx->swrContext = SwrCreateContext(&createInfo); + + /* Init Load/Store/ClearTiles Tables */ + swr_InitMemoryModule(); + + InitBackendFuncTables(); + + if (ctx->swrContext == NULL) + goto fail; + + ctx->pipe.screen = screen; + ctx->pipe.destroy = swr_destroy; + ctx->pipe.priv = priv; + ctx->pipe.create_surface = swr_create_surface; + ctx->pipe.surface_destroy = swr_surface_destroy; + 
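+   /* The transfer hooks below map SWR's linear backing store directly;
+    * swr_transfer_map handles the fence wait and the depth/stencil
+    * interleave copies. */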
ctx->pipe.transfer_map = swr_transfer_map; + ctx->pipe.transfer_unmap = swr_transfer_unmap; + + ctx->pipe.transfer_flush_region = u_default_transfer_flush_region; + ctx->pipe.transfer_inline_write = u_default_transfer_inline_write; + + ctx->pipe.resource_copy_region = swr_resource_copy; + ctx->pipe.render_condition = swr_render_condition; + + swr_state_init(&ctx->pipe); + swr_clear_init(&ctx->pipe); + swr_draw_init(&ctx->pipe); + swr_query_init(&ctx->pipe); + + ctx->pipe.blit = swr_blit; + ctx->blitter = util_blitter_create(&ctx->pipe); + if (!ctx->blitter) + goto fail; + + swr_init_scratch_buffers(ctx); + + return &ctx->pipe; + +fail: + /* Should really validate the init steps and fail gracefully */ + swr_destroy(&ctx->pipe); + return NULL; +} diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h new file mode 100644 index 00000000000..73a8e8ddda1 --- /dev/null +++ b/src/gallium/drivers/swr/swr_context.h @@ -0,0 +1,182 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#ifndef SWR_CONTEXT_H +#define SWR_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_blitter.h" +#include "jit_api.h" +#include "swr_state.h" +#include <unordered_map> + +#define SWR_NEW_BLEND (1 << 0) +#define SWR_NEW_RASTERIZER (1 << 1) +#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2) +#define SWR_NEW_SAMPLER (1 << 3) +#define SWR_NEW_SAMPLER_VIEW (1 << 4) +#define SWR_NEW_VS (1 << 5) +#define SWR_NEW_FS (1 << 6) +#define SWR_NEW_VSCONSTANTS (1 << 7) +#define SWR_NEW_FSCONSTANTS (1 << 8) +#define SWR_NEW_VERTEX (1 << 9) +#define SWR_NEW_STIPPLE (1 << 10) +#define SWR_NEW_SCISSOR (1 << 11) +#define SWR_NEW_VIEWPORT (1 << 12) +#define SWR_NEW_FRAMEBUFFER (1 << 13) +#define SWR_NEW_CLIP (1 << 14) +#define SWR_NEW_SO (1 << 15) +#define SWR_NEW_ALL 0x0000ffff + +namespace std +{ +template <> struct hash<BLEND_COMPILE_STATE> { + std::size_t operator()(const BLEND_COMPILE_STATE &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; +}; + +struct swr_jit_texture { + uint32_t width; // same as number of elements + uint32_t height; + uint32_t depth; // doubles as array size + uint32_t first_level; + uint32_t last_level; + const void *base_ptr; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; +}; + +struct swr_jit_sampler { + float min_lod; + float max_lod; + float lod_bias; + float border_color[4]; +}; + +struct swr_draw_context { + const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS]; + const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; + + swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; + swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; + + SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS]; +}; + +struct swr_context { + struct pipe_context pipe; /**< base class */ + + HANDLE swrContext; + + /** Constant state objects */ + struct swr_blend_state *blend; + struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + struct pipe_depth_stencil_alpha_state *depth_stencil; + struct pipe_rasterizer_state *rasterizer; + + struct swr_vertex_shader *vs; + struct swr_fragment_shader *fs; + struct swr_vertex_element_state *velems; + + /** Other rendering state */ + struct pipe_blend_color blend_color; + struct pipe_stencil_ref stencil_ref; + struct pipe_clip_state clip; + struct pipe_constant_buffer + constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct pipe_sampler_view * + sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + struct pipe_index_buffer index_buffer; + + struct blitter_context *blitter; + + /** Conditional query object and mode */ + struct pipe_query *render_cond_query; + uint render_cond_mode; + boolean render_cond_cond; + unsigned active_queries; + + unsigned num_vertex_buffers; + unsigned num_samplers[PIPE_SHADER_TYPES]; + unsigned num_sampler_views[PIPE_SHADER_TYPES]; + + unsigned sample_mask; + + // streamout + pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; + uint32_t 
num_so_targets; + + /* Temp storage for user_buffer constants */ + struct swr_scratch_buffers *scratch; + + // blend jit functions + std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC> *blendJIT; + + /* Derived SWR API DrawState */ + struct swr_derived_state derived; + + /* SWR private state - draw context */ + struct swr_draw_context swrDC; + + unsigned dirty; /**< Mask of SWR_NEW_x flags */ +}; + +static INLINE struct swr_context * +swr_context(struct pipe_context *pipe) +{ + return (struct swr_context *)pipe; +} + +static INLINE void +swr_update_draw_context(struct swr_context *ctx) +{ + swr_draw_context *pDC = + (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); + memcpy(pDC, &ctx->swrDC, sizeof(swr_draw_context)); +} + +struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); + +void swr_state_init(struct pipe_context *pipe); + +void swr_clear_init(struct pipe_context *pipe); + +void swr_draw_init(struct pipe_context *pipe); + +void swr_finish(struct pipe_context *pipe); +#endif diff --git a/src/gallium/drivers/swr/swr_context_llvm.h b/src/gallium/drivers/swr/swr_context_llvm.h new file mode 100644 index 00000000000..58da813123f --- /dev/null +++ b/src/gallium/drivers/swr/swr_context_llvm.h @@ -0,0 +1,124 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#pragma once + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_jit_texture +INLINE static StructType * +Gen_swr_jit_texture(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back(Type::getInt32Ty(ctx)); // width + members.push_back(Type::getInt32Ty(ctx)); // height + members.push_back(Type::getInt32Ty(ctx)); // depth + members.push_back(Type::getInt32Ty(ctx)); // first_level + members.push_back(Type::getInt32Ty(ctx)); // last_level + members.push_back(PointerType::get(Type::getInt8Ty(ctx), 0)); // base_ptr + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // row_stride + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // img_stride + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // mip_offsets + + return StructType::get(ctx, members, false); +} + +static const UINT swr_jit_texture_width = 0; +static const UINT swr_jit_texture_height = 1; +static const UINT swr_jit_texture_depth = 2; +static const UINT swr_jit_texture_first_level = 3; +static const UINT swr_jit_texture_last_level = 4; +static const UINT swr_jit_texture_base_ptr = 5; +static const UINT swr_jit_texture_row_stride = 6; +static const UINT swr_jit_texture_img_stride = 7; +static const UINT swr_jit_texture_mip_offsets = 8; + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_jit_sampler +INLINE static StructType * +Gen_swr_jit_sampler(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back(Type::getFloatTy(ctx)); // min_lod + members.push_back(Type::getFloatTy(ctx)); // max_lod + members.push_back(Type::getFloatTy(ctx)); // lod_bias + members.push_back( + ArrayType::get(Type::getFloatTy(ctx), 4)); // border_color + + return StructType::get(ctx, members, false); +} + +static const UINT swr_jit_sampler_min_lod = 0; +static const UINT swr_jit_sampler_max_lod = 1; +static const UINT swr_jit_sampler_lod_bias = 2; +static const UINT swr_jit_sampler_border_color = 3; + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_draw_context +INLINE static StructType * +Gen_swr_draw_context(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back( + ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), + PIPE_MAX_CONSTANT_BUFFERS)); // constantVS + members.push_back(ArrayType::get( + Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsVS + members.push_back( + ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), + PIPE_MAX_CONSTANT_BUFFERS)); // constantFS + members.push_back(ArrayType::get( + Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsFS + members.push_back( + ArrayType::get(Gen_swr_jit_texture(pShG), + PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesVS + members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), + PIPE_MAX_SAMPLERS)); // samplersVS + members.push_back( + ArrayType::get(Gen_swr_jit_texture(pShG), + PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesFS + members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), + PIPE_MAX_SAMPLERS)); // samplersFS + members.push_back(ArrayType::get(Gen_SWR_SURFACE_STATE(pShG), + 
SWR_NUM_ATTACHMENTS)); // renderTargets + + return StructType::get(ctx, members, false); +} + +static const UINT swr_draw_context_constantVS = 0; +static const UINT swr_draw_context_num_constantsVS = 1; +static const UINT swr_draw_context_constantFS = 2; +static const UINT swr_draw_context_num_constantsFS = 3; +static const UINT swr_draw_context_texturesVS = 4; +static const UINT swr_draw_context_samplersVS = 5; +static const UINT swr_draw_context_texturesFS = 6; +static const UINT swr_draw_context_samplersFS = 7; +static const UINT swr_draw_context_renderTargets = 8; diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp new file mode 100644 index 00000000000..428bf78cb55 --- /dev/null +++ b/src/gallium/drivers/swr/swr_draw.cpp @@ -0,0 +1,297 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "swr_screen.h" +#include "swr_context.h" +#include "swr_resource.h" +#include "swr_fence.h" +#include "swr_query.h" +#include "jit_api.h" + +#include "util/u_draw.h" +#include "util/u_prim.h" + +/* + * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY + */ +static INLINE enum PRIMITIVE_TOPOLOGY +swr_convert_prim_topology(const unsigned mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: + return TOP_POINT_LIST; + case PIPE_PRIM_LINES: + return TOP_LINE_LIST; + case PIPE_PRIM_LINE_LOOP: + return TOP_LINE_LOOP; + case PIPE_PRIM_LINE_STRIP: + return TOP_LINE_STRIP; + case PIPE_PRIM_TRIANGLES: + return TOP_TRIANGLE_LIST; + case PIPE_PRIM_TRIANGLE_STRIP: + return TOP_TRIANGLE_STRIP; + case PIPE_PRIM_TRIANGLE_FAN: + return TOP_TRIANGLE_FAN; + case PIPE_PRIM_QUADS: + return TOP_QUAD_LIST; + case PIPE_PRIM_QUAD_STRIP: + return TOP_QUAD_STRIP; + case PIPE_PRIM_POLYGON: + return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */ + case PIPE_PRIM_LINES_ADJACENCY: + return TOP_LINE_LIST_ADJ; + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + return TOP_LISTSTRIP_ADJ; + case PIPE_PRIM_TRIANGLES_ADJACENCY: + return TOP_TRI_LIST_ADJ; + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return TOP_TRI_STRIP_ADJ; + default: + assert(0 && "Unknown topology"); + return TOP_UNKNOWN; + } +}; + + +/* + * Draw vertex arrays, with optional indexing, optional instancing. 
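+ * Flow: update derived state, lazily JIT-compile a streamout shader per
+ * topology and a fetch shader keyed on the primitive-restart state, then
+ * hand off to SwrDrawInstanced / SwrDrawIndexedInstanced.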
+ */ +static void +swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct swr_context *ctx = swr_context(pipe); + + if (!swr_check_render_cond(pipe)) + return; + + if (info->indirect) { + util_draw_indirect(pipe, info); + return; + } + + /* Update derived state, pass draw info to update function */ + if (ctx->dirty) + swr_update_derived(pipe, info); + + swr_update_draw_context(ctx); + + if (ctx->vs->pipe.stream_output.num_outputs) { + if (!ctx->vs->soFunc[info->mode]) { + STREAMOUT_COMPILE_STATE state = {0}; + struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output; + + state.numVertsPerPrim = u_vertices_per_prim(info->mode); + + uint32_t offsets[MAX_SO_STREAMS] = {0}; + uint32_t num = 0; + + for (uint32_t i = 0; i < so->num_outputs; i++) { + assert(so->output[i].stream == 0); // @todo + uint32_t output_buffer = so->output[i].output_buffer; + if (so->output[i].dst_offset != offsets[output_buffer]) { + // hole - need to fill + state.stream.decl[num].bufferIndex = output_buffer; + state.stream.decl[num].hole = true; + state.stream.decl[num].componentMask = + (1 << (so->output[i].dst_offset - offsets[output_buffer])) + - 1; + num++; + offsets[output_buffer] = so->output[i].dst_offset; + } + + state.stream.decl[num].bufferIndex = output_buffer; + state.stream.decl[num].attribSlot = so->output[i].register_index - 1; + state.stream.decl[num].componentMask = + ((1 << so->output[i].num_components) - 1) + << so->output[i].start_component; + state.stream.decl[num].hole = false; + num++; + + offsets[output_buffer] += so->output[i].num_components; + } + + state.stream.numDecls = num; + + HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr; + ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state); + debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]); + assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL"); + } + + SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0); + } + + struct swr_vertex_element_state *velems = ctx->velems; + if (!velems->fsFunc + || (velems->fsState.cutIndex != info->restart_index) + || (velems->fsState.bEnableCutIndex != info->primitive_restart)) { + + velems->fsState.cutIndex = info->restart_index; + velems->fsState.bEnableCutIndex = info->primitive_restart; + + /* Create Fetch Shader */ + HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; + velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); + + debug_printf("fetch shader %p\n", velems->fsFunc); + assert(velems->fsFunc && "Error: FetchShader = NULL"); + } + + SwrSetFetchFunc(ctx->swrContext, velems->fsFunc); + + if (info->indexed) + SwrDrawIndexedInstanced(ctx->swrContext, + swr_convert_prim_topology(info->mode), + info->count, + info->instance_count, + info->start, + info->index_bias, + info->start_instance); + else + SwrDrawInstanced(ctx->swrContext, + swr_convert_prim_topology(info->mode), + info->count, + info->instance_count, + info->start, + info->start_instance); +} + + +static void +swr_flush(struct pipe_context *pipe, + struct pipe_fence_handle **fence, + unsigned flags) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(pipe->screen); + struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; + + /* If the current renderTarget is the display surface, store tiles back to + * the surface, in preparation for present (swr_flush_frontbuffer). 
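+ * (swr_flush_frontbuffer presents from the linear display target, so its
+ * contents must be resolved before the winsys reads them.)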
+ * Other renderTargets get stored back when attachment changes or + * swr_surface_destroy */ + if (cb && swr_resource(cb->texture)->display_target) + swr_store_dirty_resource(pipe, cb->texture, SWR_TILE_RESOLVED); + + if (fence) + swr_fence_reference(pipe->screen, fence, screen->flush_fence); +} + +void +swr_finish(struct pipe_context *pipe) +{ + struct pipe_fence_handle *fence = nullptr; + + swr_flush(pipe, &fence, 0); + swr_fence_finish(pipe->screen, fence, 0); + swr_fence_reference(pipe->screen, &fence, NULL); +} + + +/* + * Store SWR HotTiles back to renderTarget surface. + */ +void +swr_store_render_target(struct pipe_context *pipe, + uint32_t attachment, + enum SWR_TILE_STATE post_tile_state) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_draw_context *pDC = &ctx->swrDC; + struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; + + /* Only proceed if there's a valid surface to store to */ + if (renderTarget->pBaseAddress) { + /* Set viewport to full renderTarget width/height and disable scissor + * before StoreTiles */ + boolean change_viewport = + (ctx->derived.vp.x != 0.0f || ctx->derived.vp.y != 0.0f + || ctx->derived.vp.width != renderTarget->width + || ctx->derived.vp.height != renderTarget->height); + if (change_viewport) { + SWR_VIEWPORT vp = {0}; + vp.width = renderTarget->width; + vp.height = renderTarget->height; + SwrSetViewports(ctx->swrContext, 1, &vp, NULL); + } + + boolean scissor_enable = ctx->derived.rastState.scissorEnable; + if (scissor_enable) { + ctx->derived.rastState.scissorEnable = FALSE; + SwrSetRastState(ctx->swrContext, &ctx->derived.rastState); + } + + swr_update_draw_context(ctx); + SwrStoreTiles(ctx->swrContext, + (enum SWR_RENDERTARGET_ATTACHMENT)attachment, + post_tile_state); + + /* Restore viewport and scissor enable */ + if (change_viewport) + SwrSetViewports(ctx->swrContext, 1, &ctx->derived.vp, &ctx->derived.vpm); + if (scissor_enable) { + ctx->derived.rastState.scissorEnable = scissor_enable; + SwrSetRastState(ctx->swrContext, &ctx->derived.rastState); + } + } +} + +void +swr_store_dirty_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + enum SWR_TILE_STATE post_tile_state) +{ + /* Only store resource if it has been written to */ + if (swr_resource(resource)->status & SWR_RESOURCE_WRITE) { + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(pipe->screen); + struct swr_resource *spr = swr_resource(resource); + + swr_draw_context *pDC = &ctx->swrDC; + SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) + if (renderTargets[i].pBaseAddress == spr->swr.pBaseAddress) { + swr_store_render_target(pipe, i, post_tile_state); + + /* Mesa thinks depth/stencil are fused, so we'll never get an + * explicit resource for stencil. So, if checking depth, then + * also check for stencil. 
*/ + if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) { + swr_store_render_target( + pipe, SWR_ATTACHMENT_STENCIL, post_tile_state); + } + + /* This fence signals StoreTiles completion */ + swr_fence_submit(ctx, screen->flush_fence); + + break; + } + } +} + +void +swr_draw_init(struct pipe_context *pipe) +{ + pipe->draw_vbo = swr_draw_vbo; + pipe->flush = swr_flush; +} diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp new file mode 100644 index 00000000000..2e95b3936a6 --- /dev/null +++ b/src/gallium/drivers/swr/swr_fence.cpp @@ -0,0 +1,150 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "pipe/p_screen.h" +#include "util/u_memory.h" +#include "os/os_time.h" + +#include "swr_context.h" +#include "swr_screen.h" +#include "swr_fence.h" + +#if defined(PIPE_CC_MSVC) // portable thread yield + #define sched_yield SwitchToThread +#endif + +/* + * Fence callback, called by back-end thread on completion of all rendering up + * to SwrSync call. + */ +static void +swr_sync_cb(uint64_t userData, uint64_t userData2, uint64_t userData3) +{ + struct swr_fence *fence = (struct swr_fence *)userData; + + /* Correct value is in SwrSync data, and not the fence write field. */ + fence->read = userData2; +} + +/* + * Submit an existing fence. + */ +void +swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh) +{ + struct swr_fence *fence = swr_fence(fh); + + fence->write++; + fence->pending = TRUE; + SwrSync(ctx->swrContext, swr_sync_cb, (uint64_t)fence, fence->write, 0); +} + +/* + * Create a new fence object. + */ +struct pipe_fence_handle * +swr_fence_create() +{ + static int fence_id = 0; + struct swr_fence *fence = CALLOC_STRUCT(swr_fence); + if (!fence) + return NULL; + + pipe_reference_init(&fence->reference, 1); + fence->id = fence_id++; + + return (struct pipe_fence_handle *)fence; +} + +/** Destroy a fence. Called when refcount hits zero. 
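+ *
+ * Editor's sketch (illustrative, not part of the original patch):
+ * destruction is reached implicitly by dropping the last reference,
+ * never by calling this directly:
+ *
+ *    struct pipe_fence_handle *f = swr_fence_create();
+ *    swr_fence_submit(ctx, f);              // read != write, now pending
+ *    swr_fence_finish(screen, f, 0);        // spins until signaled
+ *    swr_fence_reference(screen, &f, NULL); // refcount 0 -> destroyed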
*/
+static void
+swr_fence_destroy(struct swr_fence *fence)
+{
+ FREE(fence);
+}
+
+/**
+ * Set ptr = fence, with reference counting
+ */
+void
+swr_fence_reference(struct pipe_screen *screen,
+ struct pipe_fence_handle **ptr,
+ struct pipe_fence_handle *f)
+{
+ struct swr_fence *fence = swr_fence(f);
+ struct swr_fence *old;
+
+ if (likely(ptr)) {
+ old = swr_fence(*ptr);
+ *ptr = f;
+ } else {
+ old = NULL;
+ }
+
+ if (pipe_reference(&old->reference, &fence->reference))
+ swr_fence_destroy(old);
+}
+
+static INLINE boolean
+swr_is_fence_done(struct pipe_fence_handle *fence_handle)
+{
+ struct swr_fence *fence = swr_fence(fence_handle);
+ return (fence->read == fence->write);
+}
+
+/*
+ * Wait for the fence to finish.
+ */
+boolean
+swr_fence_finish(struct pipe_screen *screen,
+ struct pipe_fence_handle *fence_handle,
+ uint64_t timeout)
+{
+ while (!swr_is_fence_done(fence_handle))
+ sched_yield();
+
+ swr_fence(fence_handle)->pending = FALSE;
+
+ return TRUE;
+}
+
+
+uint64_t
+swr_get_timestamp(struct pipe_screen *screen)
+{
+ return os_time_get_nano();
+}
+
+
+void
+swr_fence_init(struct pipe_screen *p_screen)
+{
+ p_screen->fence_reference = swr_fence_reference;
+ p_screen->fence_finish = swr_fence_finish;
+ p_screen->get_timestamp = swr_get_timestamp;
+
+ /* Create persistent StoreTiles "flush" fence, used to signal completion
+ * of flushing tile state back to the resource texture, via StoreTiles. */
+ struct swr_screen *screen = swr_screen(p_screen);
+ screen->flush_fence = swr_fence_create();
+}
diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h new file mode 100644 index 00000000000..df3776e8989 --- /dev/null +++ b/src/gallium/drivers/swr/swr_fence.h @@ -0,0 +1,72 @@
+/****************************************************************************
+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ ***************************************************************************/ + +#ifndef SWR_FENCE_H +#define SWR_FENCE_H + +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +struct pipe_screen; + +struct swr_fence { + struct pipe_reference reference; + + uint64_t read; + uint64_t write; + + unsigned pending; + + unsigned id; /* Just for reference */ +}; + + +static inline struct swr_fence * +swr_fence(struct pipe_fence_handle *fence) +{ + return (struct swr_fence *)fence; +} + +static INLINE boolean +swr_is_fence_pending(struct pipe_fence_handle *fence_handle) +{ + return swr_fence(fence_handle)->pending; +} + + +void swr_fence_init(struct pipe_screen *screen); + +struct pipe_fence_handle *swr_fence_create(); + +void swr_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *f); + +boolean swr_fence_finish(struct pipe_screen *screen, + struct pipe_fence_handle *fence_handle, + uint64_t timeout); + +void +swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); + +uint64_t swr_get_timestamp(struct pipe_screen *screen); + +#endif diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp new file mode 100644 index 00000000000..2113c371c5f --- /dev/null +++ b/src/gallium/drivers/swr/swr_loader.cpp @@ -0,0 +1,67 @@ +/**************************************************************************** + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "util/u_cpu_detect.h" +#include "util/u_dl.h" +#include "swr_public.h" + +#include <stdio.h> +#include <dlfcn.h> + +typedef pipe_screen *(*screen_create_proc)(struct sw_winsys *winsys); + +struct pipe_screen * +swr_create_screen(struct sw_winsys *winsys) +{ + fprintf(stderr, "SWR detected "); + + util_dl_library *pLibrary = nullptr; + + util_cpu_detect(); + if (util_cpu_caps.has_avx2) { + fprintf(stderr, "AVX2\n"); + pLibrary = util_dl_open("libswrAVX2.so"); + } else if (util_cpu_caps.has_avx) { + fprintf(stderr, "AVX\n"); + pLibrary = util_dl_open("libswrAVX.so"); + } else { + fprintf(stderr, "no AVX/AVX2 support. 
Aborting!\n"); + exit(-1); + } + + if (!pLibrary) { + fprintf(stderr, "SWR library load failure: %s\n", util_dl_error()); + exit(-1); + } + + util_dl_proc pScreenProc = util_dl_get_proc_address(pLibrary, "swr_create_screen"); + + if (!pScreenProc) { + fprintf(stderr, "SWR library search failure: %s\n", util_dl_error()); + exit(-1); + } + + screen_create_proc pScreenCreate = (screen_create_proc)pScreenProc; + + return pScreenCreate(winsys); +} diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h new file mode 100644 index 00000000000..65fc169c85f --- /dev/null +++ b/src/gallium/drivers/swr/swr_memory.h @@ -0,0 +1,99 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#pragma once + +void LoadHotTile( + SWR_SURFACE_STATE *pSrcSurface, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, uint32_t renderTargetArrayIndex, + uint8_t *pDstHotTile); + +void StoreHotTile( + SWR_SURFACE_STATE *pDstSurface, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, uint32_t renderTargetArrayIndex, + uint8_t *pSrcHotTile); + +void StoreHotTileClear( + SWR_SURFACE_STATE *pDstSurface, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, + UINT y, + const float* pClearColor); + +INLINE void +swr_LoadHotTile(HANDLE hPrivateContext, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, + uint32_t renderTargetArrayIndex, uint8_t* pDstHotTile) +{ + // Grab source surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex]; + + LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); +} + +INLINE void +swr_StoreHotTile(HANDLE hPrivateContext, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, + uint32_t renderTargetArrayIndex, uint8_t* pSrcHotTile) +{ + // Grab destination surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; + + StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); +} + +INLINE void +swr_StoreHotTileClear(HANDLE hPrivateContext, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, + UINT y, + const float* pClearColor) +{ + // Grab destination surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; + + StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, pClearColor); +} + +void InitSimLoadTilesTable(); +void InitSimStoreTilesTable(); +void InitSimClearTilesTable(); + +/* Init Load/Store/ClearTiles Tables */ +INLINE void swr_InitMemoryModule() +{ + InitSimLoadTilesTable(); + InitSimStoreTilesTable(); + InitSimClearTilesTable(); +} diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h new file mode 100644 index 00000000000..0814c3b85d6 --- /dev/null +++ b/src/gallium/drivers/swr/swr_public.h @@ -0,0 +1,46 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_PUBLIC_H +#define SWR_PUBLIC_H + +struct pipe_screen; +struct sw_winsys; +struct sw_displaytarget; + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); + +struct sw_winsys *swr_get_winsys(struct pipe_screen *pipe); + +struct sw_displaytarget *swr_get_displaytarget(struct pipe_resource *resource); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp new file mode 100644 index 00000000000..810c50b2f8f --- /dev/null +++ b/src/gallium/drivers/swr/swr_query.cpp @@ -0,0 +1,334 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "os/os_time.h" +#include "swr_context.h" +#include "swr_fence.h" +#include "swr_query.h" +#include "swr_screen.h" +#include "swr_state.h" + + +static struct swr_query * +swr_query(struct pipe_query *p) +{ + return (struct swr_query *)p; +} + +static struct pipe_query * +swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index) +{ + struct swr_query *pq; + + assert(type < PIPE_QUERY_TYPES); + assert(index < MAX_SO_STREAMS); + + pq = CALLOC_STRUCT(swr_query); + + if (pq) { + pq->type = type; + pq->index = index; + } + + return (struct pipe_query *)pq; +} + + +static void +swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_query *pq = swr_query(q); + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(swr_context(pipe), pq->fence); + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + FREE(pq); +} + + +// XXX Create a fence callback, rather than stalling SwrWaitForIdle +static void +swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq) +{ + struct swr_context *ctx = swr_context(pipe); + + assert(pq->result); + union pipe_query_result *result = pq->result; + boolean enable_stats = pq->enable_stats; + SWR_STATS swr_stats = {0}; + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(ctx, pq->fence); + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + /* + * These queries don't need SWR Stats enabled in the core + * Set and return. + */ + switch (pq->type) { + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + result->u64 = swr_get_timestamp(pipe->screen); + return; + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* nothing to do here */ + return; + break; + case PIPE_QUERY_GPU_FINISHED: + result->b = TRUE; /* XXX TODO Add an api func to SWR to compare drawId + vs LastRetiredId? 
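+ Editor's note (not part of the original patch): until such an API
+ exists, GPU_FINISHED optimistically reports TRUE here; the fence
+ submit/finish at the top of this function is what actually waits
+ for the core to go idle.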
*/ + return; + break; + default: + /* Any query that needs SwrCore stats */ + break; + } + + /* + * All other results are collected from SwrCore counters + */ + + /* XXX, Should turn this into a fence callback and skip the stall */ + SwrGetStats(ctx->swrContext, &swr_stats); + /* SwrGetStats returns immediately, wait for collection */ + SwrWaitForIdle(ctx->swrContext); + + switch (pq->type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_COUNTER: + result->u64 = swr_stats.DepthPassCount; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + result->u64 = swr_stats.IaPrimitives; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 = swr_stats.SoNumPrimsWritten[pq->index]; + break; + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { + struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; + so_stats->num_primitives_written = + swr_stats.SoNumPrimsWritten[pq->index]; + so_stats->primitives_storage_needed = + swr_stats.SoPrimStorageNeeded[pq->index]; + } break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + struct pipe_query_data_pipeline_statistics *p_stats = + &result->pipeline_statistics; + p_stats->ia_vertices = swr_stats.IaVertices; + p_stats->ia_primitives = swr_stats.IaPrimitives; + p_stats->vs_invocations = swr_stats.VsInvocations; + p_stats->gs_invocations = swr_stats.GsInvocations; + p_stats->gs_primitives = swr_stats.GsPrimitives; + p_stats->c_invocations = swr_stats.CPrimitives; + p_stats->c_primitives = swr_stats.CPrimitives; + p_stats->ps_invocations = swr_stats.PsInvocations; + p_stats->hs_invocations = swr_stats.HsInvocations; + p_stats->ds_invocations = swr_stats.DsInvocations; + p_stats->cs_invocations = swr_stats.CsInvocations; + } break; + default: + assert(0 && "Unsupported query"); + break; + } + + /* Only change stat collection if there are no active queries */ + if (ctx->active_queries == 0) + SwrEnableStats(ctx->swrContext, enable_stats); +} + + +static boolean +swr_get_query_result(struct pipe_context *pipe, + struct pipe_query *q, + boolean wait, + union pipe_query_result *result) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(ctx, pq->fence); + if (!wait) + return FALSE; + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + /* XXX: Need to handle counter rollover */ + + switch (pq->type) { + /* Booleans */ + case PIPE_QUERY_OCCLUSION_PREDICATE: + result->b = pq->end.u64 != pq->start.u64 ? 
TRUE : FALSE; + break; + case PIPE_QUERY_GPU_FINISHED: + result->b = pq->end.b; + break; + /* Counters */ + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 = pq->end.u64 - pq->start.u64; + break; + /* Structures */ + case PIPE_QUERY_SO_STATISTICS: { + struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; + struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; + struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; + so_stats->num_primitives_written = + end->num_primitives_written - start->num_primitives_written; + so_stats->primitives_storage_needed = + end->primitives_storage_needed - start->primitives_storage_needed; + } break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: { + /* os_get_time_nano returns nanoseconds */ + result->timestamp_disjoint.frequency = UINT64_C(1000000000); + result->timestamp_disjoint.disjoint = FALSE; + } break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + struct pipe_query_data_pipeline_statistics *p_stats = + &result->pipeline_statistics; + struct pipe_query_data_pipeline_statistics *start = + &pq->start.pipeline_statistics; + struct pipe_query_data_pipeline_statistics *end = + &pq->end.pipeline_statistics; + p_stats->ia_vertices = end->ia_vertices - start->ia_vertices; + p_stats->ia_primitives = end->ia_primitives - start->ia_primitives; + p_stats->vs_invocations = end->vs_invocations - start->vs_invocations; + p_stats->gs_invocations = end->gs_invocations - start->gs_invocations; + p_stats->gs_primitives = end->gs_primitives - start->gs_primitives; + p_stats->c_invocations = end->c_invocations - start->c_invocations; + p_stats->c_primitives = end->c_primitives - start->c_primitives; + p_stats->ps_invocations = end->ps_invocations - start->ps_invocations; + p_stats->hs_invocations = end->hs_invocations - start->hs_invocations; + p_stats->ds_invocations = end->ds_invocations - start->ds_invocations; + p_stats->cs_invocations = end->cs_invocations - start->cs_invocations; + } break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { + struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; + struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; + uint64_t num_primitives_written = + end->num_primitives_written - start->num_primitives_written; + uint64_t primitives_storage_needed = + end->primitives_storage_needed - start->primitives_storage_needed; + result->b = num_primitives_written > primitives_storage_needed; + } break; + default: + assert(0 && "Unsupported query"); + break; + } + + return TRUE; +} + +static boolean +swr_begin_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + /* Initialize Results */ + memset(&pq->start, 0, sizeof(pq->start)); + memset(&pq->end, 0, sizeof(pq->end)); + + /* Gather start stats and enable SwrCore counters */ + pq->result = &pq->start; + pq->enable_stats = TRUE; + swr_gather_stats(pipe, pq); + ctx->active_queries++; + + /* override start timestamp to 0 for TIMESTAMP query */ + if (pq->type == PIPE_QUERY_TIMESTAMP) + pq->start.u64 = 0; + + return true; +} + +static void +swr_end_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + assert(ctx->active_queries + && "swr_end_query, there are no active queries!"); + ctx->active_queries--; + + /* 
Gather end stats and disable SwrCore counters */ + pq->result = &pq->end; + pq->enable_stats = FALSE; + swr_gather_stats(pipe, pq); +} + + +boolean +swr_check_render_cond(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + boolean b, wait; + uint64_t result; + + if (!ctx->render_cond_query) + return TRUE; /* no query predicate, draw normally */ + + wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT + || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); + + b = pipe->get_query_result( + pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); + if (b) + return (!result == ctx->render_cond_cond); + else + return TRUE; +} + +void +swr_query_init(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + + pipe->create_query = swr_create_query; + pipe->destroy_query = swr_destroy_query; + pipe->begin_query = swr_begin_query; + pipe->end_query = swr_end_query; + pipe->get_query_result = swr_get_query_result; + + ctx->active_queries = 0; +} diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h new file mode 100644 index 00000000000..836d07b68ae --- /dev/null +++ b/src/gallium/drivers/swr/swr_query.h @@ -0,0 +1,46 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_QUERY_H +#define SWR_QUERY_H + + +#include <limits.h> + +struct swr_query { + unsigned type; /* PIPE_QUERY_* */ + unsigned index; + + union pipe_query_result *result; + union pipe_query_result start; + union pipe_query_result end; + + struct pipe_fence_handle *fence; + + boolean enable_stats; +}; + +extern void swr_query_init(struct pipe_context *pipe); + +extern boolean swr_check_render_cond(struct pipe_context *pipe); +#endif diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h new file mode 100644 index 00000000000..2fdc7683cb8 --- /dev/null +++ b/src/gallium/drivers/swr/swr_resource.h @@ -0,0 +1,143 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_RESOURCE_H +#define SWR_RESOURCE_H + +#include "pipe/p_state.h" +#include "api.h" + +struct sw_displaytarget; + +enum swr_resource_status { + SWR_RESOURCE_UNUSED = 0x0, + SWR_RESOURCE_READ = 0x1, + SWR_RESOURCE_WRITE = 0x2, +}; + +struct swr_resource { + struct pipe_resource base; + + bool has_depth; + bool has_stencil; + + UINT alignedWidth; + UINT alignedHeight; + + SWR_SURFACE_STATE swr; + SWR_SURFACE_STATE secondary; /* for faking depth/stencil merged formats */ + + struct sw_displaytarget *display_target; + + unsigned row_stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned img_stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; + + enum swr_resource_status status; + + /* pipe_context to which resource is currently bound. */ + struct pipe_context *bound_to_context; +}; + + +static INLINE struct swr_resource * +swr_resource(struct pipe_resource *resource) +{ + return (struct swr_resource *)resource; +} + +static INLINE boolean +swr_resource_is_texture(const struct pipe_resource *resource) +{ + switch (resource->target) { + case PIPE_BUFFER: + return FALSE; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_3D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return TRUE; + default: + assert(0); + return FALSE; + } +} + + +static INLINE void * +swr_resource_data(struct pipe_resource *resource) +{ + struct swr_resource *swr_r = swr_resource(resource); + + assert(!swr_resource_is_texture(resource)); + + return swr_r->swr.pBaseAddress; +} + + +void swr_store_render_target(struct pipe_context *pipe, + uint32_t attachment, + enum SWR_TILE_STATE post_tile_state); + +void swr_store_dirty_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + enum SWR_TILE_STATE post_tile_state); + +void swr_update_resource_status(struct pipe_context *, + const struct pipe_draw_info *); + +/* + * Functions to indicate a resource's in-use status. 
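+ *
+ * Editor's sketch (illustrative, not part of the original patch): state
+ * binding code is expected to tag each referenced resource, e.g.
+ *
+ *    swr_resource_write(pipe, swr_resource(fb->cbufs[0]->texture));
+ *    swr_resource_read(pipe, swr_resource(view->texture));
+ *
+ * so that swr_store_dirty_resource() later knows which surfaces hold
+ * unflushed hot-tile data.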
+ */ +static INLINE enum +swr_resource_status & operator|=(enum swr_resource_status & a, + enum swr_resource_status b) { + return (enum swr_resource_status &)((int&)a |= (int)b); +} + +static INLINE void +swr_resource_read(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status |= SWR_RESOURCE_READ; + resource->bound_to_context = pipe; +} + +static INLINE void +swr_resource_write(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status |= SWR_RESOURCE_WRITE; + resource->bound_to_context = pipe; +} + +static INLINE void +swr_resource_unused(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status = SWR_RESOURCE_UNUSED; + resource->bound_to_context = nullptr; +} + +#endif diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp new file mode 100644 index 00000000000..28eb2acb910 --- /dev/null +++ b/src/gallium/drivers/swr/swr_scratch.cpp @@ -0,0 +1,116 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "util/u_memory.h" +#include "swr_context.h" +#include "swr_scratch.h" +#include "api.h" + + +void * +swr_copy_to_scratch_space(struct swr_context *ctx, + struct swr_scratch_space *space, + const void *user_buffer, + unsigned int size) +{ + void *ptr; + assert(space); + assert(user_buffer); + assert(size); + + if (size >= 2048) { /* XXX TODO create KNOB_ for this */ + /* Use per draw SwrAllocDrawContextMemory for larger copies */ + ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4); + } else { + /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */ + unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; + + /* Need to grow space */ + if (max_size_in_flight > space->current_size) { + /* Must idle the pipeline, this is infrequent */ + SwrWaitForIdle(ctx->swrContext); + + space->current_size = max_size_in_flight; + + if (space->base) { + align_free(space->base); + space->base = NULL; + } + + if (!space->base) { + space->base = (uint8_t *)align_malloc(space->current_size, 4); + space->head = (void *)space->base; + } + } + + /* Wrap */ + if (((uint8_t *)space->head + size) + >= ((uint8_t *)space->base + space->current_size)) { + /* + * TODO XXX: Should add a fence on wrap. 
Assumption is that + * current_space >> size, and there are at least MAX_DRAWS_IN_FLIGHT + * draws in scratch. So fence would always be met on wrap. A fence + * would ensure that first frame in buffer is done before wrapping. + * If fence ever needs to be waited on, can increase buffer size. + * So far in testing, this hasn't been necessary. + */ + space->head = space->base; + } + + ptr = space->head; + space->head = (uint8_t *)space->head + size; + } + + /* Copy user_buffer to scratch */ + memcpy(ptr, user_buffer, size); + + return ptr; +} + + +void +swr_init_scratch_buffers(struct swr_context *ctx) +{ + struct swr_scratch_buffers *scratch; + + scratch = CALLOC_STRUCT(swr_scratch_buffers); + ctx->scratch = scratch; +} + +void +swr_destroy_scratch_buffers(struct swr_context *ctx) +{ + struct swr_scratch_buffers *scratch = ctx->scratch; + + if (scratch) { + if (scratch->vs_constants.base) + align_free(scratch->vs_constants.base); + if (scratch->fs_constants.base) + align_free(scratch->fs_constants.base); + if (scratch->vertex_buffer.base) + align_free(scratch->vertex_buffer.base); + if (scratch->index_buffer.base) + align_free(scratch->index_buffer.base); + FREE(scratch); + } +} diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h new file mode 100644 index 00000000000..74218d63644 --- /dev/null +++ b/src/gallium/drivers/swr/swr_scratch.h @@ -0,0 +1,63 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_SCRATCH_H +#define SWR_SCRATCH_H + +struct swr_scratch_space { + void *head; + unsigned int current_size; + /* TODO XXX: Add a fence for wrap condition. */ + + void *base; +}; + +struct swr_scratch_buffers { + struct swr_scratch_space vs_constants; + struct swr_scratch_space fs_constants; + struct swr_scratch_space vertex_buffer; + struct swr_scratch_space index_buffer; +}; + + +/* + * swr_copy_to_scratch_space + * Copies size bytes of user_buffer into the scratch ring buffer. + * Used to store temporary data such as client arrays and constants. + * + * Inputs: + * space ptr to scratch pool (vs_constants, fs_constants) + * user_buffer, data to copy into scratch space + * size to be copied + * Returns: + * pointer to data copied to scratch space. 
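+ *
+ * Editor's sketch (illustrative, not part of the original patch):
+ *
+ *    void *copy = swr_copy_to_scratch_space(
+ *       ctx, &ctx->scratch->vs_constants, user_ptr, num_bytes);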
+ */
+void *swr_copy_to_scratch_space(struct swr_context *ctx,
+ struct swr_scratch_space *space,
+ const void *user_buffer,
+ unsigned int size);
+
+void swr_init_scratch_buffers(struct swr_context *ctx);
+void swr_destroy_scratch_buffers(struct swr_context *ctx);
+
+#endif
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp new file mode 100644 index 00000000000..e46df47570f --- /dev/null +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -0,0 +1,745 @@
+/****************************************************************************
+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ***************************************************************************/
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_cpu_detect.h"
+
+#include "state_tracker/sw_winsys.h"
+
+extern "C" {
+#include "gallivm/lp_bld_limits.h"
+}
+
+#include "swr_public.h"
+#include "swr_screen.h"
+#include "swr_context.h"
+#include "swr_resource.h"
+#include "swr_fence.h"
+#include "gen_knobs.h"
+
+#include "jit_api.h"
+
+#include <stdio.h>
+
+/* MSVC case insensitive compare */
+#if defined(PIPE_CC_MSVC)
+ #define strcasecmp lstrcmpiA
+#endif
+
+/*
+ * Max texture sizes
+ * XXX Check max texture size values against core and sampler.
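+ *
+ * Editor's note (derivation, not part of the original patch): a level
+ * count of N gives a base extent of 2^(N-1) texels per dimension, so the
+ * 14 2D/cube levels below correspond to 8192x8192 and the 12 3D levels
+ * to 2048x2048x2048, matching the "8K"/"2K" comments.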
+ */
+#define SWR_MAX_TEXTURE_SIZE (4 * 1024 * 1024 * 1024ULL) /* 4GB */
+#define SWR_MAX_TEXTURE_2D_LEVELS 14 /* 8K x 8K for now */
+#define SWR_MAX_TEXTURE_3D_LEVELS 12 /* 2K x 2K x 2K for now */
+#define SWR_MAX_TEXTURE_CUBE_LEVELS 14 /* 8K x 8K for now */
+#define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
+
+static const char *
+swr_get_name(struct pipe_screen *screen)
+{
+ return "SWR";
+}
+
+static const char *
+swr_get_vendor(struct pipe_screen *screen)
+{
+ return "Intel Corporation";
+}
+
+static boolean
+swr_is_format_supported(struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned sample_count,
+ unsigned bind)
+{
+ struct sw_winsys *winsys = swr_screen(screen)->winsys;
+ const struct util_format_description *format_desc;
+
+ assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D
+ || target == PIPE_TEXTURE_1D_ARRAY
+ || target == PIPE_TEXTURE_2D
+ || target == PIPE_TEXTURE_2D_ARRAY
+ || target == PIPE_TEXTURE_RECT
+ || target == PIPE_TEXTURE_3D
+ || target == PIPE_TEXTURE_CUBE
+ || target == PIPE_TEXTURE_CUBE_ARRAY);
+
+ format_desc = util_format_description(format);
+ if (!format_desc)
+ return FALSE;
+
+ if (sample_count > 1)
+ return FALSE;
+
+ if (bind
+ & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) {
+ if (!winsys->is_displaytarget_format_supported(winsys, bind, format))
+ return FALSE;
+ }
+
+ if (bind & PIPE_BIND_RENDER_TARGET) {
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+
+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
+ return FALSE;
+
+ /*
+ * Although possible, it is unnatural to render into compressed or YUV
+ * surfaces. So disable these here to avoid going into weird paths
+ * inside the state trackers.
+ */
+ if (format_desc->block.width != 1 || format_desc->block.height != 1)
+ return FALSE;
+ }
+
+ if (bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+
+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static int
+swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+ switch (param) {
+ case PIPE_CAP_NPOT_TEXTURES:
+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+ return 1;
+ case PIPE_CAP_TWO_SIDED_STENCIL:
+ return 1;
+ case PIPE_CAP_SM3:
+ return 1;
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ return 0;
+ case PIPE_CAP_POINT_SPRITE:
+ return 1;
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return PIPE_MAX_COLOR_BUFS;
+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+ return 1;
+ case PIPE_CAP_OCCLUSION_QUERY:
+ case PIPE_CAP_QUERY_TIME_ELAPSED:
+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+ return 1;
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+ return 1;
+ case PIPE_CAP_TEXTURE_SHADOW_MAP:
+ return 1;
+ case PIPE_CAP_TEXTURE_SWIZZLE:
+ return 1;
+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+ return 0;
+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+ return SWR_MAX_TEXTURE_2D_LEVELS;
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ return SWR_MAX_TEXTURE_3D_LEVELS;
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ return SWR_MAX_TEXTURE_CUBE_LEVELS;
+ case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+ return 1;
+ case PIPE_CAP_INDEP_BLEND_ENABLE:
+ return 1;
+ case PIPE_CAP_INDEP_BLEND_FUNC:
+ return 1;
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+ return 0; // Don't support lower left frag coord.
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + return 1; + case PIPE_CAP_DEPTH_CLIP_DISABLE: + return 1; + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return MAX_SO_STREAMS; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return MAX_ATTRIBUTES; + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 1024; + case PIPE_CAP_MAX_VERTEX_STREAMS: + return 1; + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + case PIPE_CAP_PRIMITIVE_RESTART: + return 1; + case PIPE_CAP_SHADER_STENCIL_EXPORT: + return 1; + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_START_INSTANCE: + return 1; + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + return 1; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return SWR_MAX_TEXTURE_ARRAY_LAYERS; + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -8; + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 7; + case PIPE_CAP_CONDITIONAL_RENDER: + return 1; + case PIPE_CAP_TEXTURE_BARRIER: + return 0; + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: /* draw module */ + case PIPE_CAP_VERTEX_COLOR_CLAMPED: /* draw module */ + return 1; + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + return 1; + case PIPE_CAP_GLSL_FEATURE_LEVEL: + return 330; + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return 0; + case PIPE_CAP_COMPUTE: + return 0; + case PIPE_CAP_USER_VERTEX_BUFFERS: + case PIPE_CAP_USER_INDEX_BUFFERS: + case PIPE_CAP_USER_CONSTANT_BUFFERS: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + return 1; + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return 16; + case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return 0; + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return 64; + case PIPE_CAP_QUERY_TIMESTAMP: + return 1; + case PIPE_CAP_CUBE_MAP_ARRAY: + return 0; + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + return 1; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + return 65536; + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + return 0; + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return 0; + case PIPE_CAP_MAX_VIEWPORTS: + return 1; + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_NATIVE; + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + case PIPE_CAP_TEXTURE_GATHER_SM5: + return 0; + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + return 1; + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_TEXTURE_GATHER_OFFSETS: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 0; + case PIPE_CAP_FAKE_SW_MSAA: + return 1; + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + return 0; + case PIPE_CAP_DRAW_INDIRECT: + return 1; + + case PIPE_CAP_VENDOR_ID: + return 0xFFFFFFFF; + case PIPE_CAP_DEVICE_ID: + return 0xFFFFFFFF; + case PIPE_CAP_ACCELERATED: + return 0; + case PIPE_CAP_VIDEO_MEMORY: { + /* XXX: Do we want to return the full amount of system memory ? 
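+ Editor's note (not part of the original patch): PIPE_CAP_VIDEO_MEMORY
+ is reported in megabytes, hence the >> 20 below converting bytes to
+ MiB.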
*/
+ uint64_t system_memory;
+
+ if (!os_get_total_physical_memory(&system_memory))
+ return 0;
+
+ return (int)(system_memory >> 20);
+ }
+ case PIPE_CAP_UMA:
+ return 1;
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ return 1;
+ case PIPE_CAP_CLIP_HALFZ:
+ return 1;
+ case PIPE_CAP_VERTEXID_NOBASE:
+ return 0;
+ case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+ return 1;
+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+ return 0;
+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ return 0; // xxx
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+ return 0;
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ return 0;
+ case PIPE_CAP_DEPTH_BOUNDS_TEST:
+ return 0; // xxx
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ return 1;
+ case PIPE_CAP_TGSI_TXQS:
+ case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+ case PIPE_CAP_SHAREABLE_SHADERS:
+ case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+ case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_DRAW_PARAMETERS:
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+ case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+ case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
+ return 0;
+ }
+
+ /* should only get here on unhandled cases */
+ debug_printf("Unexpected PIPE_CAP %d query\n", param);
+ return 0;
+}
+
+static int
+swr_get_shader_param(struct pipe_screen *screen,
+ unsigned shader,
+ enum pipe_shader_cap param)
+{
+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT)
+ return gallivm_get_shader_param(param);
+
+ // Todo: geometry, tessellation, compute
+ return 0;
+}
+
+
+static float
+swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
+{
+ switch (param) {
+ case PIPE_CAPF_MAX_LINE_WIDTH:
+ case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+ case PIPE_CAPF_MAX_POINT_WIDTH:
+ return 255.0; /* arbitrary */
+ case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+ return 0.0;
+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+ return 0.0;
+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+ return 0.0;
+ case PIPE_CAPF_GUARD_BAND_LEFT:
+ case PIPE_CAPF_GUARD_BAND_TOP:
+ case PIPE_CAPF_GUARD_BAND_RIGHT:
+ case PIPE_CAPF_GUARD_BAND_BOTTOM:
+ return 0.0;
+ }
+ /* should only get here on unhandled cases */
+ debug_printf("Unexpected PIPE_CAPF %d query\n", param);
+ return 0.0;
+}
+
+SWR_FORMAT
+mesa_to_swr_format(enum pipe_format format)
+{
+ const struct util_format_description *format_desc =
+ util_format_description(format);
+ if (!format_desc)
+ return (SWR_FORMAT)-1;
+
+ // A more robust check would compare all attributes of the formats;
+ // luckily, format names are mostly standardized.
+ for (int i = 0; i < NUM_SWR_FORMATS; i++) {
+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo((SWR_FORMAT)i);
+
+ if (!strcasecmp(format_desc->short_name, swr_desc.name))
+ return (SWR_FORMAT)i;
+ }
+
+ // ...
with some exceptions + switch (format) { + case PIPE_FORMAT_R8G8B8A8_SRGB: + return R8G8B8A8_UNORM_SRGB; + case PIPE_FORMAT_B8G8R8A8_SRGB: + return B8G8R8A8_UNORM_SRGB; + case PIPE_FORMAT_I8_UNORM: + return R8_UNORM; + case PIPE_FORMAT_Z16_UNORM: + return R16_UNORM; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return R24_UNORM_X8_TYPELESS; + case PIPE_FORMAT_Z32_FLOAT: + return R32_FLOAT; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return R32_FLOAT_X8X24_TYPELESS; + case PIPE_FORMAT_L8A8_UNORM: + return R8G8_UNORM; + default: + break; + } + + debug_printf("asked to convert unsupported format %s\n", + format_desc->name); + return (SWR_FORMAT)-1; +} + +static boolean +swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) +{ + struct sw_winsys *winsys = screen->winsys; + struct sw_displaytarget *dt; + + UINT stride; + dt = winsys->displaytarget_create(winsys, + res->base.bind, + res->base.format, + res->alignedWidth, + res->alignedHeight, + 64, NULL, + &stride); + + if (dt == NULL) + return FALSE; + + void *map = winsys->displaytarget_map(winsys, dt, 0); + + res->display_target = dt; + res->swr.pBaseAddress = (uint8_t*) map; + + /* Clear the display target surface */ + if (map) + memset(map, 0, res->alignedHeight * stride); + + winsys->displaytarget_unmap(winsys, dt); + + return TRUE; +} + +static boolean +swr_texture_layout(struct swr_screen *screen, + struct swr_resource *res, + boolean allocate) +{ + struct pipe_resource *pt = &res->base; + + pipe_format fmt = pt->format; + const struct util_format_description *desc = util_format_description(fmt); + + res->has_depth = util_format_has_depth(desc); + res->has_stencil = util_format_has_stencil(desc); + + if (res->has_stencil && !res->has_depth) + fmt = PIPE_FORMAT_R8_UINT; + + res->swr.width = pt->width0; + res->swr.height = pt->height0; + res->swr.depth = pt->depth0; + res->swr.type = swr_convert_target_type(pt->target); + res->swr.tileMode = SWR_TILE_NONE; + res->swr.format = mesa_to_swr_format(fmt); + res->swr.numSamples = (1 << pt->nr_samples); + + SWR_FORMAT_INFO finfo = GetFormatInfo(res->swr.format); + + unsigned total_size = 0; + unsigned width = pt->width0; + unsigned height = pt->height0; + unsigned depth = pt->depth0; + unsigned layers = pt->array_size; + + for (int level = 0; level <= pt->last_level; level++) { + unsigned alignedWidth, alignedHeight; + unsigned num_slices; + + if (pt->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL)) { + alignedWidth = align(width, KNOB_MACROTILE_X_DIM); + alignedHeight = align(height, KNOB_MACROTILE_Y_DIM); + } else { + alignedWidth = width; + alignedHeight = height; + } + + if (level == 0) { + res->alignedWidth = alignedWidth; + res->alignedHeight = alignedHeight; + } + + res->row_stride[level] = alignedWidth * finfo.Bpp; + res->img_stride[level] = res->row_stride[level] * alignedHeight; + res->mip_offsets[level] = total_size; + + if (pt->target == PIPE_TEXTURE_3D) + num_slices = depth; + else if (pt->target == PIPE_TEXTURE_1D_ARRAY + || pt->target == PIPE_TEXTURE_2D_ARRAY + || pt->target == PIPE_TEXTURE_CUBE + || pt->target == PIPE_TEXTURE_CUBE_ARRAY) + num_slices = layers; + else + num_slices = 1; + + total_size += res->img_stride[level] * num_slices; + if (total_size > SWR_MAX_TEXTURE_SIZE) + return FALSE; + + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); + } + + res->swr.halign = res->alignedWidth; + res->swr.valign = res->alignedHeight; + res->swr.pitch = res->row_stride[0]; + + if 
+ + if (allocate) { + res->swr.pBaseAddress = (uint8_t *)_aligned_malloc(total_size, 64); + + if (res->has_depth && res->has_stencil) { + res->secondary.width = pt->width0; + res->secondary.height = pt->height0; + res->secondary.depth = pt->depth0; + res->secondary.type = SURFACE_2D; + res->secondary.tileMode = SWR_TILE_NONE; + res->secondary.format = R8_UINT; + res->secondary.numSamples = (1 << pt->nr_samples); + + /* Query format info only after the secondary (stencil) format is + * set, so the pitch is derived from R8_UINT rather than a + * zero-initialized format. */ + SWR_FORMAT_INFO finfo = GetFormatInfo(res->secondary.format); + res->secondary.pitch = res->alignedWidth * finfo.Bpp; + + res->secondary.pBaseAddress = (uint8_t *)_aligned_malloc( + res->alignedHeight * res->secondary.pitch, 64); + } + } + + return TRUE; +} + +static boolean +swr_can_create_resource(struct pipe_screen *screen, + const struct pipe_resource *templat) +{ + struct swr_resource res; + memset(&res, 0, sizeof(res)); + res.base = *templat; + return swr_texture_layout(swr_screen(screen), &res, false); +} + +static struct pipe_resource * +swr_resource_create(struct pipe_screen *_screen, + const struct pipe_resource *templat) +{ + struct swr_screen *screen = swr_screen(_screen); + struct swr_resource *res = CALLOC_STRUCT(swr_resource); + if (!res) + return NULL; + + res->base = *templat; + pipe_reference_init(&res->base.reference, 1); + res->base.screen = &screen->base; + + if (swr_resource_is_texture(&res->base)) { + if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT + | PIPE_BIND_SHARED)) { + /* displayable surface + * first call swr_texture_layout without allocating to finish + * filling out the SWR_SURFACE_STATE in res */ + swr_texture_layout(screen, res, false); + if (!swr_displaytarget_layout(screen, res)) + goto fail; + } else { + /* texture map */ + if (!swr_texture_layout(screen, res, true)) + goto fail; + } + } else { + /* other data (vertex buffer, const buffer, etc) */ + assert(util_format_get_blocksize(templat->format) == 1); + assert(templat->height0 == 1); + assert(templat->depth0 == 1); + assert(templat->last_level == 0); + + /* Easiest to just call swr_texture_layout, as it sets up + * SWR_SURFACE_STATE in res */ + if (!swr_texture_layout(screen, res, true)) + goto fail; + } + + return &res->base; + +fail: + FREE(res); + return NULL; +}
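The non-texture branch above only handles linear, byte-addressed buffers, as its asserts spell out. A minimal sketch of a template that satisfies them (hypothetical values; standard gallium API):

   struct pipe_resource templ = {};
   templ.target = PIPE_BUFFER;
   templ.format = PIPE_FORMAT_R8_UNORM;  /* blocksize 1, per the assert */
   templ.width0 = 65536;                 /* buffer size in bytes */
   templ.height0 = templ.depth0 = templ.array_size = 1;
   templ.bind = PIPE_BIND_VERTEX_BUFFER;
   struct pipe_resource *buf = screen->resource_create(screen, &templ);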
+ +static void +swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct swr_resource *spr = swr_resource(pt); + struct pipe_context *pipe = spr->bound_to_context; + + /* Only wait on fence if the resource is being used */ + if (pipe && spr->status) { + /* But, if there's no fence pending, submit one. + * XXX: Remove once draw timestamps are implemented. */ + if (!swr_is_fence_pending(screen->flush_fence)) + swr_fence_submit(swr_context(pipe), screen->flush_fence); + + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + } + + /* + * Free resource primary surface. If resource is display target, winsys + * manages the buffer and will free it on displaytarget_destroy. + */ + if (spr->display_target) { + /* display target */ + struct sw_winsys *winsys = screen->winsys; + winsys->displaytarget_destroy(winsys, spr->display_target); + } else + _aligned_free(spr->swr.pBaseAddress); + + _aligned_free(spr->secondary.pBaseAddress); + + FREE(spr); +} + + +static void +swr_flush_frontbuffer(struct pipe_screen *p_screen, + struct pipe_resource *resource, + unsigned level, + unsigned layer, + void *context_private, + struct pipe_box *sub_box) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct sw_winsys *winsys = screen->winsys; + struct swr_resource *spr = swr_resource(resource); + struct pipe_context *pipe = spr->bound_to_context; + + if (pipe) { + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + SwrEndFrame(swr_context(pipe)->swrContext); + } + + debug_assert(spr->display_target); + if (spr->display_target) + winsys->displaytarget_display( + winsys, spr->display_target, context_private, sub_box); +} + + +static void +swr_destroy_screen(struct pipe_screen *p_screen) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct sw_winsys *winsys = screen->winsys; + + fprintf(stderr, "SWR destroy screen!\n"); + + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_fence_reference(p_screen, &screen->flush_fence, NULL); + + JitDestroyContext(screen->hJitMgr); + + if (winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + +PUBLIC +struct pipe_screen * +swr_create_screen(struct sw_winsys *winsys) +{ + struct swr_screen *screen = CALLOC_STRUCT(swr_screen); + + if (!screen) + return NULL; + + if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) { + g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152); + } + + screen->winsys = winsys; + screen->base.get_name = swr_get_name; + screen->base.get_vendor = swr_get_vendor; + screen->base.is_format_supported = swr_is_format_supported; + screen->base.context_create = swr_create_context; + screen->base.can_create_resource = swr_can_create_resource; + + screen->base.destroy = swr_destroy_screen; + screen->base.get_param = swr_get_param; + screen->base.get_shader_param = swr_get_shader_param; + screen->base.get_paramf = swr_get_paramf; + + screen->base.resource_create = swr_resource_create; + screen->base.resource_destroy = swr_resource_destroy; + + screen->base.flush_frontbuffer = swr_flush_frontbuffer; + + screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR); + + swr_fence_init(&screen->base); + + return &screen->base; +} + +struct sw_winsys * +swr_get_winsys(struct pipe_screen *pipe) +{ + return ((struct swr_screen *)pipe)->winsys; +} + +struct sw_displaytarget * +swr_get_displaytarget(struct pipe_resource *resource) +{ + return ((struct swr_resource *)resource)->display_target; +} diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h new file mode 100644 index 00000000000..a96dc44cf66 --- /dev/null +++ b/src/gallium/drivers/swr/swr_screen.h @@ -0,0 +1,52 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_SCREEN_H +#define SWR_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_defines.h" +#include "api.h" + +struct sw_winsys; + +struct swr_screen { + struct pipe_screen base; + + struct pipe_fence_handle *flush_fence; + + struct sw_winsys *winsys; + + HANDLE hJitMgr; +}; + +static INLINE struct swr_screen * +swr_screen(struct pipe_screen *pipe) +{ + return (struct swr_screen *)pipe; +} + +SWR_FORMAT +mesa_to_swr_format(enum pipe_format format); + +#endif diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp new file mode 100644 index 00000000000..ff16d0f2f11 --- /dev/null +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -0,0 +1,591 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "JitManager.h" +#include "state.h" +#include "state_llvm.h" +#include "builder.h" + +#include "llvm-c/Core.h" +#include "llvm/Support/CBindingWrapping.h" + +#include "tgsi/tgsi_strings.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_struct.h" +#include "gallivm/lp_bld_tgsi.h" + +#include "swr_context.h" +#include "swr_context_llvm.h" +#include "swr_state.h" +#include "swr_screen.h" + +bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) +{ + return !memcmp(&lhs, &rhs, sizeof(lhs)); +} + +void +swr_generate_fs_key(struct swr_jit_key &key, + struct swr_context *ctx, + swr_fragment_shader *swr_fs) +{ + key.nr_cbufs = ctx->framebuffer.nr_cbufs; + key.light_twoside = ctx->rasterizer->light_twoside; + memcpy(&key.vs_output_semantic_name, + &ctx->vs->info.base.output_semantic_name, + sizeof(key.vs_output_semantic_name)); + memcpy(&key.vs_output_semantic_idx, + &ctx->vs->info.base.output_semantic_index, + sizeof(key.vs_output_semantic_idx)); + + key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + + for (unsigned i = 0; i < key.nr_samplers; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_sampler_state( + &key.sampler[i].sampler_state, + ctx->samplers[PIPE_SHADER_FRAGMENT][i]); + } + } + + /* + * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes + * are dx10-style? Can't really have mixed opcodes, at least not + * if we want to skip the holes here (without rescanning tgsi). + */ + if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { + key.nr_sampler_views = + swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + lp_sampler_static_texture_state( + &key.sampler[i].texture_state, + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } else { + key.nr_sampler_views = key.nr_samplers; + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_texture_state( + &key.sampler[i].texture_state, + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } +} + +struct BuilderSWR : public Builder { + BuilderSWR(JitManager *pJitMgr) + : Builder(pJitMgr) + { + pJitMgr->SetupNewModule(); + } + + PFN_VERTEX_FUNC + CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); + PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); +}; + +PFN_VERTEX_FUNC +BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +{ + swr_vs->linkageMask = 0; + + for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { + switch (swr_vs->info.base.output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + default: + swr_vs->linkageMask |= (1 << i); + break; + } + } + + // tgsi_dump(swr_vs->pipe.tokens, 0); + + struct gallivm_state *gallivm = + gallivm_create("VS", wrap(&JM()->mContext)); + gallivm->module = wrap(JM()->mpCurrentModule); + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(outputs, 0, sizeof(outputs)); + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + + std::vector<Type *> 
vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; + FunctionType *vsFuncType = + FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); + + // create new vertex shader function + auto pFunction = Function::Create(vsFuncType, + GlobalValue::ExternalLinkage, + "VS", + JM()->mpCurrentModule); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto argitr = pFunction->arg_begin(); + Value *hPrivateData = &*argitr++; + hPrivateData->setName("hPrivateData"); + Value *pVsCtx = &*argitr++; + pVsCtx->setName("vsCtx"); + + Value *consts_ptr = GEP(hPrivateData, {C(0), C(swr_draw_context_constantVS)}); + + consts_ptr->setName("vs_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); + const_sizes_ptr->setName("num_vs_constants"); + + Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); + + for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { + const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (mask & (1 << channel)) { + inputs[attrib][channel] = + wrap(LOAD(vtxInput, {0, 0, attrib, channel})); + } + } + } + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); + system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); + + lp_build_tgsi_soa(gallivm, + swr_vs->pipe.tokens, + lp_type_float_vec(32, 32 * 8), + NULL, // mask + wrap(consts_ptr), + wrap(const_sizes_ptr), + &system_values, + inputs, + outputs, + NULL, // wrap(hPrivateData), (sampler context) + NULL, // thread data + NULL, // sampler + &swr_vs->info.base, + NULL); // geometry shader face + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); + + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { + if (!outputs[attrib][channel]) + continue; + + Value *val = LOAD(unwrap(outputs[attrib][channel])); + + uint32_t outSlot = attrib; + if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) + outSlot = VERTEX_POINT_SIZE_SLOT; + STORE(val, vtxOutput, {0, 0, outSlot, channel}); + } + } + + RET_VOID(); + + gallivm_verify_function(gallivm, wrap(pFunction)); + gallivm_compile_module(gallivm); + + // lp_debug_dump_value(func); + + PFN_VERTEX_FUNC pFunc = + (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); + + debug_printf("vert shader %p\n", pFunc); + assert(pFunc && "Error: VertShader = NULL"); + +#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) + JM()->mIsModuleFinalized = true; +#endif + + return pFunc; +} + +PFN_VERTEX_FUNC +swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +{ + BuilderSWR builder( + reinterpret_cast<JitManager *>(swr_screen(ctx->screen)->hJitMgr)); + return builder.CompileVS(ctx, swr_vs); +} + +static unsigned +locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) +{ + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { + if ((info->output_semantic_name[i] == name) + && (info->output_semantic_index[i] == index)) { + return i - 1; 
// position is not part of the linkage + } + } + + if (name == TGSI_SEMANTIC_COLOR) { // BCOLOR fallback + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { + if ((info->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) + && (info->output_semantic_index[i] == index)) { + return i - 1; // position is not part of the linkage + } + } + } + + return 0xFFFFFFFF; +} + +PFN_PIXEL_KERNEL +BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) +{ + struct swr_fragment_shader *swr_fs = ctx->fs; + + // tgsi_dump(swr_fs->pipe.tokens, 0); + + struct gallivm_state *gallivm = + gallivm_create("FS", wrap(&JM()->mContext)); + gallivm->module = wrap(JM()->mpCurrentModule); + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + struct lp_build_sampler_soa *sampler = NULL; + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + + std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; + FunctionType *funcType = + FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); + + auto pFunction = Function::Create(funcType, + GlobalValue::ExternalLinkage, + "FS", + JM()->mpCurrentModule); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto args = pFunction->arg_begin(); + Value *hPrivateData = &*args++; + hPrivateData->setName("hPrivateData"); + Value *pPS = &*args++; + pPS->setName("psCtx"); + + Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); + consts_ptr->setName("fs_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); + const_sizes_ptr->setName("num_fs_constants"); + + // xxx should check for flat shading versus interpolation + + + // load *pAttribs, *pPerspAttribs + Value *pRawAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pRawAttribs"); + Value *pPerspAttribs = + LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); + + swr_fs->constantMask = 0; + swr_fs->pointSpriteMask = 0; + + for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { + const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; + const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; + const unsigned interpLoc = swr_fs->info.base.input_interpolate_loc[attrib]; + + if (!mask) + continue; + + // load i,j + Value *vi = nullptr, *vj = nullptr; + switch (interpLoc) { + case TGSI_INTERPOLATE_LOC_CENTER: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_center}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_center}, "j"); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_centroid}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_centroid}, "j"); + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_sample}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_sample}, "j"); + break; + } + + // load/compute w + Value *vw = nullptr, *pAttribs; + if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) { + pAttribs = 
pPerspAttribs; + switch (interpLoc) { + case TGSI_INTERPOLATE_LOC_CENTER: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center})); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_centroid})); + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_sample})); + break; + } + } else { + pAttribs = pRawAttribs; + vw = VIMMED1(1.f); + } + + vw->setName("w"); + + ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; + ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; + + if (semantic_name == TGSI_SEMANTIC_FACE) { + Value *ff = + UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); + ff = FSUB(FMUL(ff, C(2.0f)), C(1.0f)); + ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); + + inputs[attrib][0] = wrap(ff); + inputs[attrib][1] = wrap(VIMMED1(0.0f)); + inputs[attrib][2] = wrap(VIMMED1(0.0f)); + inputs[attrib][3] = wrap(VIMMED1(1.0f)); + continue; + } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord + inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_center}, "vX")); + inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_center}, "vY")); + inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); + inputs[attrib][3] = + wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW")); + continue; + } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { + Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); + inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); + inputs[attrib][1] = wrap(VIMMED1(0)); + inputs[attrib][2] = wrap(VIMMED1(0)); + inputs[attrib][3] = wrap(VIMMED1(0)); + continue; + } + + unsigned linkedAttrib = + locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); + if (linkedAttrib == 0xFFFFFFFF) { + // not found - check for point sprite + if (ctx->rasterizer->sprite_coord_enable) { + linkedAttrib = ctx->vs->info.base.num_outputs - 1; + swr_fs->pointSpriteMask |= (1 << linkedAttrib); + } else { + fprintf(stderr, + "Missing %s[%d]\n", + tgsi_semantic_names[semantic_name], + semantic_idx); + assert(0 && "attribute linkage not found"); + } + } + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + swr_fs->constantMask |= 1 << linkedAttrib; + } + + for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (mask & (1 << channel)) { + Value *indexA = C(linkedAttrib * 12 + channel); + Value *indexB = C(linkedAttrib * 12 + channel + 4); + Value *indexC = C(linkedAttrib * 12 + channel + 8); + + if ((semantic_name == TGSI_SEMANTIC_COLOR) + && ctx->rasterizer->light_twoside) { + unsigned bcolorAttrib = locate_linkage( + TGSI_SEMANTIC_BCOLOR, semantic_idx, &ctx->vs->info.base); + + unsigned diff = 12 * (bcolorAttrib - linkedAttrib); + + Value *back = + XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); + + Value *offset = MUL(back, C(diff)); + offset->setName("offset"); + + indexA = ADD(indexA, offset); + indexB = ADD(indexB, offset); + indexC = ADD(indexC, offset); + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + swr_fs->constantMask |= 1 << bcolorAttrib; + } + } + + Value *va = VBROADCAST(LOAD(GEP(pAttribs, indexA))); + Value *vb = VBROADCAST(LOAD(GEP(pAttribs, indexB))); + Value *vc = VBROADCAST(LOAD(GEP(pAttribs, indexC))); + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + inputs[attrib][channel] = wrap(va); + } else { + Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), 
vj); + + vc = FMUL(vk, vc); + + Value *interp = FMUL(va, vi); + Value *interp1 = FMUL(vb, vj); + interp = FADD(interp, interp1); + interp = FADD(interp, vc); + if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) + interp = FMUL(interp, vw); + inputs[attrib][channel] = wrap(interp); + } + } + } + } + + sampler = swr_sampler_soa_create(key.sampler); + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + + struct lp_build_mask_context mask; + + if (swr_fs->info.base.uses_kill) { + Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask"); + lp_build_mask_begin( + &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); + } + + lp_build_tgsi_soa(gallivm, + swr_fs->pipe.tokens, + lp_type_float_vec(32, 32 * 8), + swr_fs->info.base.uses_kill ? &mask : NULL, // mask + wrap(consts_ptr), + wrap(const_sizes_ptr), + &system_values, + inputs, + outputs, + wrap(hPrivateData), + NULL, // thread data + sampler, // sampler + &swr_fs->info.base, + NULL); // geometry shader face + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; + attrib++) { + switch (swr_fs->info.base.output_semantic_name[attrib]) { + case TGSI_SEMANTIC_POSITION: { + // write z + LLVMValueRef outZ = + LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); + STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); + break; + } + case TGSI_SEMANTIC_COLOR: { + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (!outputs[attrib][channel]) + continue; + + LLVMValueRef out = + LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); + if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { + for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { + STORE(unwrap(out), + pPS, + {0, SWR_PS_CONTEXT_shaded, rt, channel}); + } + } else { + STORE(unwrap(out), + pPS, + {0, + SWR_PS_CONTEXT_shaded, + swr_fs->info.base.output_semantic_index[attrib], + channel}); + } + } + break; + } + default: { + fprintf(stderr, + "unknown output from FS %s[%d]\n", + tgsi_semantic_names[swr_fs->info.base + .output_semantic_name[attrib]], + swr_fs->info.base.output_semantic_index[attrib]); + break; + } + } + } + + LLVMValueRef mask_result = 0; + if (swr_fs->info.base.uses_kill) { + mask_result = lp_build_mask_end(&mask); + } + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (swr_fs->info.base.uses_kill) { + STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_activeMask}); + } + + RET_VOID(); + + gallivm_verify_function(gallivm, wrap(pFunction)); + + gallivm_compile_module(gallivm); + + PFN_PIXEL_KERNEL kernel = + (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); + debug_printf("frag shader %p\n", kernel); + assert(kernel && "Error: FragShader = NULL"); + +#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) + JM()->mIsModuleFinalized = true; +#endif + + return kernel; +} + +PFN_PIXEL_KERNEL +swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) +{ + BuilderSWR builder( + reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr)); + return builder.CompileFS(ctx, key); +} diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h new file mode 100644 index 00000000000..e22a7c48c2a --- /dev/null +++ b/src/gallium/drivers/swr/swr_shader.h @@ -0,0 +1,60 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. 
All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#pragma once + +/* forward-declared as structs to match their definitions */ +struct swr_vertex_shader; +struct swr_fragment_shader; +struct swr_jit_key; + +PFN_VERTEX_FUNC +swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); + +PFN_PIXEL_KERNEL +swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); + +void swr_generate_fs_key(struct swr_jit_key &key, + struct swr_context *ctx, + swr_fragment_shader *swr_fs); + +struct swr_jit_key { + unsigned nr_cbufs; + unsigned light_twoside; + ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; + unsigned nr_samplers; + unsigned nr_sampler_views; + struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + +namespace std +{ +template <> struct hash<swr_jit_key> { + std::size_t operator()(const swr_jit_key &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; +} + +bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs);
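Because operator== is a raw memcmp and the hash is a CRC over the key's raw bytes, any padding in swr_jit_key has to be zeroed before use. A minimal sketch of the caching pattern this header enables (illustrative; swr_update_derived below does exactly this with ctx->fs->map):

   #include <unordered_map>

   std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> cache;

   swr_jit_key key;
   memset(&key, 0, sizeof(key));      /* zero padding for memcmp/crc32 */
   swr_generate_fs_key(key, ctx, ctx->fs);

   PFN_PIXEL_KERNEL func;
   auto it = cache.find(key);         /* std::hash<swr_jit_key> + operator== */
   if (it != cache.end()) {
      func = it->second;              /* cache hit: reuse the JITted kernel */
   } else {
      func = swr_compile_fs(ctx, key);
      cache.insert(std::make_pair(key, func));
   }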
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp new file mode 100644 index 00000000000..47ee3cb2664 --- /dev/null +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -0,0 +1,1437 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "common/os.h" +#include "jit_api.h" +#include "JitManager.h" +#include "state_llvm.h" + +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_format.h" + +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_helpers.h" +#include "util/u_framebuffer.h" + +#include "swr_state.h" +#include "swr_context.h" +#include "swr_context_llvm.h" +#include "swr_screen.h" +#include "swr_resource.h" +#include "swr_tex_sample.h" +#include "swr_scratch.h" +#include "swr_shader.h" +#include "swr_fence.h" + +/* These should be pulled out into separate files as necessary. + * Just initializing everything here to get going. */ + +static void * +swr_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); + + memcpy(&state->pipe, blend, sizeof(*blend)); + + struct pipe_blend_state *pipe_blend = &state->pipe; + + for (int target = 0; + target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); + target++) { + + struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; + SWR_RENDER_TARGET_BLEND_STATE &blendState = + state->blendState.renderTarget[target]; + RENDER_TARGET_BLEND_COMPILE_STATE &compileState = + state->compileState[target]; + + if (target != 0 && !pipe_blend->independent_blend_enable) { + memcpy(&compileState, + &state->compileState[0], + sizeof(RENDER_TARGET_BLEND_COMPILE_STATE)); + continue; + } + + compileState.blendEnable = rt_blend->blend_enable; + if (compileState.blendEnable) { + compileState.sourceAlphaBlendFactor = + swr_convert_blend_factor(rt_blend->alpha_src_factor); + compileState.destAlphaBlendFactor = + swr_convert_blend_factor(rt_blend->alpha_dst_factor); + compileState.sourceBlendFactor = + swr_convert_blend_factor(rt_blend->rgb_src_factor); + compileState.destBlendFactor = + swr_convert_blend_factor(rt_blend->rgb_dst_factor); + + compileState.colorBlendFunc = + swr_convert_blend_func(rt_blend->rgb_func); + compileState.alphaBlendFunc = + swr_convert_blend_func(rt_blend->alpha_func); + } + compileState.logicOpEnable = state->pipe.logicop_enable; + if (compileState.logicOpEnable) { + compileState.logicOpFunc = + swr_convert_logic_op(state->pipe.logicop_func); + } + + blendState.writeDisableRed = + (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; + blendState.writeDisableGreen = + (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; + blendState.writeDisableBlue = + (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; + blendState.writeDisableAlpha = + (rt_blend->colormask & PIPE_MASK_A) ? 0 : 1; + + if (rt_blend->colormask == 0) + compileState.blendEnable = false; + } + + return state; +}
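A quick illustration of the colormask-to-write-disable inversion above (hypothetical mask value):

   /* colormask = PIPE_MASK_R | PIPE_MASK_G
    *   -> writeDisableRed = 0, writeDisableGreen = 0,
    *      writeDisableBlue = 1, writeDisableAlpha = 1
    * and with independent_blend_enable == 0, target 0's compile state
    * is replicated verbatim to every other render target. */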
+ +static void +swr_bind_blend_state(struct pipe_context *pipe, void *blend) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->blend == blend) + return; + + ctx->blend = (swr_blend_state *)blend; + + ctx->dirty |= SWR_NEW_BLEND; +} + +static void +swr_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE(blend); +} + +static void +swr_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *color) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->blend_color = *color; + + ctx->dirty |= SWR_NEW_BLEND; +} + +static void +swr_set_stencil_ref(struct pipe_context *pipe, + const struct pipe_stencil_ref *ref) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->stencil_ref = *ref; + + ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; +} + +static void * +swr_create_depth_stencil_state( + struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + struct pipe_depth_stencil_alpha_state *state; + + state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, + sizeof *depth_stencil); + + return state; +} + +static void +swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) + return; + + ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; + + ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; +} + +static void +swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) +{ + FREE(depth); +} + + +static void * +swr_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rast) +{ + struct pipe_rasterizer_state *state; + state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); + + return state; +} + +static void +swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) +{ + struct swr_context *ctx = swr_context(pipe); + const struct pipe_rasterizer_state *rasterizer = + (const struct pipe_rasterizer_state *)handle; + + if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) + return; + + ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; + + ctx->dirty |= SWR_NEW_RASTERIZER; +} + +static void +swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) +{ + FREE(rasterizer); +} + + +static void * +swr_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + struct pipe_sampler_state *state = + (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); + + return state; +} + +static void +swr_bind_sampler_states(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + void **samplers) +{ + struct swr_context *ctx = swr_context(pipe); + unsigned i; + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(ctx->samplers[shader])); + + /* set the new samplers */ + ctx->num_samplers[shader] = num; + for (i = 0; i < num; i++) { + ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; + } + + ctx->dirty |= SWR_NEW_SAMPLER; +} + +static void +swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) +{ + FREE(sampler); +} + + +static struct pipe_sampler_view * +swr_create_sampler_view(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_sampler_view *templ) +{ + struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); + + if (view) { + *view =
*templ; + view->reference.count = 1; + view->texture = NULL; + pipe_resource_reference(&view->texture, texture); + view->context = pipe; + } + + return view; +} + +static void +swr_set_sampler_views(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + struct pipe_sampler_view **views) +{ + struct swr_context *ctx = swr_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(ctx->sampler_views[shader])); + + /* set the new sampler views */ + ctx->num_sampler_views[shader] = num; + for (i = 0; i < num; i++) { + /* Note: we're using pipe_sampler_view_release() here to work around + * a possible crash when the old view belongs to another context that + * was already destroyed. + */ + pipe_sampler_view_release(pipe, &ctx->sampler_views[shader][start + i]); + pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], + views[i]); + } + + ctx->dirty |= SWR_NEW_SAMPLER_VIEW; +} + +static void +swr_sampler_view_destroy(struct pipe_context *pipe, + struct pipe_sampler_view *view) +{ + pipe_resource_reference(&view->texture, NULL); + FREE(view); +} + +static void * +swr_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *vs) +{ + struct swr_vertex_shader *swr_vs = + (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); + if (!swr_vs) + return NULL; + + swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); + swr_vs->pipe.stream_output = vs->stream_output; + + lp_build_tgsi_info(vs->tokens, &swr_vs->info); + + swr_vs->func = swr_compile_vs(pipe, swr_vs); + + swr_vs->soState = {0}; + + if (swr_vs->pipe.stream_output.num_outputs) { + pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; + + swr_vs->soState.soEnable = true; + // soState.rasterizerDisable set on state dirty + // soState.streamToRasterizer not used + + for (uint32_t i = 0; i < stream_output->num_outputs; i++) { + swr_vs->soState.streamMasks[stream_output->output[i].stream] |= + 1 << (stream_output->output[i].register_index - 1); + } + for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { + swr_vs->soState.streamNumEntries[i] = + _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); + } + } + + return swr_vs; +} + +static void +swr_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->vs == vs) + return; + + ctx->vs = (swr_vertex_shader *)vs; + ctx->dirty |= SWR_NEW_VS; +} + +static void +swr_delete_vs_state(struct pipe_context *pipe, void *vs) +{ + struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; + FREE((void *)swr_vs->pipe.tokens); + FREE(vs); +} + +static void * +swr_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *fs) +{ + struct swr_fragment_shader *swr_fs = new swr_fragment_shader; + if (!swr_fs) + return NULL; + + swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); + + lp_build_tgsi_info(fs->tokens, &swr_fs->info); + + return swr_fs; +} + + +static void +swr_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->fs == fs) + return; + + ctx->fs = (swr_fragment_shader *)fs; + ctx->dirty |= SWR_NEW_FS; +} + +static void +swr_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct swr_fragment_shader *swr_fs = (swr_fragment_shader *)fs; + FREE((void *)swr_fs->pipe.tokens); + delete swr_fs; +} + + +static void +swr_set_constant_buffer(struct pipe_context *pipe, + uint shader, + uint index, + struct pipe_constant_buffer 
*cb) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_resource *constants = cb ? cb->buffer : NULL; + + assert(shader < PIPE_SHADER_TYPES); + assert(index < Elements(ctx->constants[shader])); + + /* note: reference counting */ + util_copy_constant_buffer(&ctx->constants[shader][index], cb); + + if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { + ctx->dirty |= SWR_NEW_VSCONSTANTS; + } else if (shader == PIPE_SHADER_FRAGMENT) { + ctx->dirty |= SWR_NEW_FSCONSTANTS; + } + + if (cb && cb->user_buffer) { + pipe_resource_reference(&constants, NULL); + } +} + + +static void * +swr_create_vertex_elements_state(struct pipe_context *pipe, + unsigned num_elements, + const struct pipe_vertex_element *attribs) +{ + struct swr_vertex_element_state *velems; + assert(num_elements <= PIPE_MAX_ATTRIBS); + velems = CALLOC_STRUCT(swr_vertex_element_state); + if (velems) { + velems->fsState.numAttribs = num_elements; + for (unsigned i = 0; i < num_elements; i++) { + // XXX: we should do this keyed on the VS usage info + + const struct util_format_description *desc = + util_format_description(attribs[i].src_format); + + velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; + velems->fsState.layout[i].Format = + mesa_to_swr_format(attribs[i].src_format); + velems->fsState.layout[i].StreamIndex = + attribs[i].vertex_buffer_index; + velems->fsState.layout[i].InstanceEnable = + attribs[i].instance_divisor != 0; + velems->fsState.layout[i].ComponentControl0 = + desc->channel[0].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl1 = + desc->channel[1].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl2 = + desc->channel[2].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl3 = + desc->channel[3].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store1Fp; + velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; + velems->fsState.layout[i].InstanceDataStepRate = + attribs[i].instance_divisor; + + /* Calculate the pitch of each stream */ + const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( + mesa_to_swr_format(attribs[i].src_format)); + velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; + } + } + + return velems; +} + +static void +swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_vertex_element_state *swr_velems = + (struct swr_vertex_element_state *)velems; + + ctx->velems = swr_velems; + ctx->dirty |= SWR_NEW_VERTEX; +} + +static void +swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) +{ + /* XXX Need to destroy fetch shader? 
*/ + FREE(velems); +} + + +static void +swr_set_vertex_buffers(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_elements, + const struct pipe_vertex_buffer *buffers) +{ + struct swr_context *ctx = swr_context(pipe); + + assert(num_elements <= PIPE_MAX_ATTRIBS); + + util_set_vertex_buffers_count(ctx->vertex_buffer, + &ctx->num_vertex_buffers, + buffers, + start_slot, + num_elements); + + ctx->dirty |= SWR_NEW_VERTEX; +} + + +static void +swr_set_index_buffer(struct pipe_context *pipe, + const struct pipe_index_buffer *ib) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ib) + memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer)); + else + memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer)); + + ctx->dirty |= SWR_NEW_VERTEX; +} + +static void +swr_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->poly_stipple = *stipple; /* struct copy */ + ctx->dirty |= SWR_NEW_STIPPLE; +} + +static void +swr_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->clip = *clip; + /* XXX Unimplemented, but prevents crash */ + + ctx->dirty |= SWR_NEW_CLIP; +} + + +static void +swr_set_scissor_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_scissor_state *scissor) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->scissor = *scissor; + ctx->dirty |= SWR_NEW_SCISSOR; +} + +static void +swr_set_viewport_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *vpt) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->viewport = *vpt; + ctx->dirty |= SWR_NEW_VIEWPORT; +} + + +static void +swr_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct swr_context *ctx = swr_context(pipe); + + boolean changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); + + assert(fb->width <= KNOB_GUARDBAND_WIDTH); + assert(fb->height <= KNOB_GUARDBAND_HEIGHT); + + if (changed) { + unsigned i; + for (i = 0; i < fb->nr_cbufs; ++i) + pipe_surface_reference(&ctx->framebuffer.cbufs[i], fb->cbufs[i]); + for (; i < ctx->framebuffer.nr_cbufs; ++i) + pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); + + ctx->framebuffer.nr_cbufs = fb->nr_cbufs; + + ctx->framebuffer.width = fb->width; + ctx->framebuffer.height = fb->height; + + pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER; + } +} + + +static void +swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) +{ + struct swr_context *ctx = swr_context(pipe); + + if (sample_mask != ctx->sample_mask) { + ctx->sample_mask = sample_mask; + ctx->dirty |= SWR_NEW_RASTERIZER; + } +} + +/* + * Update resource in-use status + * All resources bound to color or depth targets marked as WRITE resources. + * VBO Vertex/index buffers and texture views marked as READ resources. 
+ */ +void +swr_update_resource_status(struct pipe_context *pipe, + const struct pipe_draw_info *p_draw_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + /* colorbuffer targets */ + if (fb->nr_cbufs) + for (uint32_t i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) + swr_resource_write(pipe, swr_resource(fb->cbufs[i]->texture)); + + /* depth/stencil target */ + if (fb->zsbuf) + swr_resource_write(pipe, swr_resource(fb->zsbuf->texture)); + + /* VBO vertex buffers */ + for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) { + struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; + if (!vb->user_buffer) + swr_resource_read(pipe, swr_resource(vb->buffer)); + } + + /* VBO index buffer */ + if (p_draw_info && p_draw_info->indexed) { + struct pipe_index_buffer *ib = &ctx->index_buffer; + if (!ib->user_buffer) + swr_resource_read(pipe, swr_resource(ib->buffer)); + } + + /* texture sampler views */ + for (uint32_t i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + struct pipe_sampler_view *view = + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; + if (view) + swr_resource_read(pipe, swr_resource(view->texture)); + } +} + +void +swr_update_derived(struct pipe_context *pipe, + const struct pipe_draw_info *p_draw_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(ctx->pipe.screen); + + /* Any state that requires dirty flags to be re-triggered sets this mask */ + /* For example, user_buffer vertex and index buffers. */ + unsigned post_update_dirty_flags = 0; + + /* Render Targets */ + if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + SWR_SURFACE_STATE *new_attachment[SWR_NUM_ATTACHMENTS] = {0}; + UINT i; + + /* colorbuffer targets */ + if (fb->nr_cbufs) + for (i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) { + struct swr_resource *colorBuffer = + swr_resource(fb->cbufs[i]->texture); + new_attachment[SWR_ATTACHMENT_COLOR0 + i] = &colorBuffer->swr; + } + + /* depth/stencil target */ + if (fb->zsbuf) { + struct swr_resource *depthStencilBuffer = + swr_resource(fb->zsbuf->texture); + if (depthStencilBuffer->has_depth) { + new_attachment[SWR_ATTACHMENT_DEPTH] = &depthStencilBuffer->swr; + + if (depthStencilBuffer->has_stencil) + new_attachment[SWR_ATTACHMENT_STENCIL] = + &depthStencilBuffer->secondary; + + } else if (depthStencilBuffer->has_stencil) + new_attachment[SWR_ATTACHMENT_STENCIL] = &depthStencilBuffer->swr; + } + + /* Make the attachment updates */ + swr_draw_context *pDC = &ctx->swrDC; + SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; + unsigned need_fence = FALSE; + for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { + void *new_base = nullptr; + if (new_attachment[i]) + new_base = new_attachment[i]->pBaseAddress; + + /* StoreTile for changed target */ + if (renderTargets[i].pBaseAddress != new_base) { + if (renderTargets[i].pBaseAddress) { + /* If changing attachment to a new target, mark tiles as + * INVALID so they are reloaded from surface. + * If detaching attachment, mark tiles as RESOLVED so core + * won't try to load from non-existent target. */ + enum SWR_TILE_STATE post_state = (new_attachment[i] + ? 
SWR_TILE_INVALID : SWR_TILE_RESOLVED); + swr_store_render_target(pipe, i, post_state); + + need_fence |= TRUE; + } + + /* Make new attachment */ + if (new_attachment[i]) + renderTargets[i] = *new_attachment[i]; + else + if (renderTargets[i].pBaseAddress) + renderTargets[i] = {0}; + } + } + + /* This fence ensures any attachment changes are resolved before the + * next draw */ + if (need_fence) + swr_fence_submit(ctx, screen->flush_fence); + } + + /* Raster state */ + if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) { + pipe_rasterizer_state *rasterizer = ctx->rasterizer; + pipe_framebuffer_state *fb = &ctx->framebuffer; + + SWR_RASTSTATE *rastState = &ctx->derived.rastState; + rastState->cullMode = swr_convert_cull_mode(rasterizer->cull_face); + rastState->frontWinding = rasterizer->front_ccw + ? SWR_FRONTWINDING_CCW + : SWR_FRONTWINDING_CW; + rastState->scissorEnable = rasterizer->scissor; + rastState->pointSize = rasterizer->point_size > 0.0f + ? rasterizer->point_size + : 1.0f; + rastState->lineWidth = rasterizer->line_width > 0.0f + ? rasterizer->line_width + : 1.0f; + + rastState->pointParam = rasterizer->point_size_per_vertex; + + rastState->pointSpriteEnable = rasterizer->sprite_coord_enable; + rastState->pointSpriteTopOrigin = + rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; + + /* XXX TODO: Add multisample */ + rastState->msaaRastEnable = false; + rastState->rastMode = SWR_MSAA_RASTMODE_OFF_PIXEL; + rastState->sampleCount = SWR_MULTISAMPLE_1X; + rastState->bForcedSampleCount = false; + + bool do_offset = false; + switch (rasterizer->fill_front) { + case PIPE_POLYGON_MODE_FILL: + do_offset = rasterizer->offset_tri; + break; + case PIPE_POLYGON_MODE_LINE: + do_offset = rasterizer->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + do_offset = rasterizer->offset_point; + break; + } + + if (do_offset) { + rastState->depthBias = rasterizer->offset_units; + rastState->slopeScaledDepthBias = rasterizer->offset_scale; + rastState->depthBiasClamp = rasterizer->offset_clamp; + } else { + rastState->depthBias = 0; + rastState->slopeScaledDepthBias = 0; + rastState->depthBiasClamp = 0; + } + struct pipe_surface *zb = fb->zsbuf; + if (zb && swr_resource(zb->texture)->has_depth) + rastState->depthFormat = swr_resource(zb->texture)->swr.format; + + rastState->depthClipEnable = rasterizer->depth_clip; + + SwrSetRastState(ctx->swrContext, rastState); + } + + /* Scissor */ + if (ctx->dirty & SWR_NEW_SCISSOR) { + pipe_scissor_state *scissor = &ctx->scissor; + BBOX bbox(scissor->miny, scissor->maxy, + scissor->minx, scissor->maxx); + SwrSetScissorRects(ctx->swrContext, 1, &bbox); + } + + /* Viewport */ + if (ctx->dirty & (SWR_NEW_VIEWPORT | SWR_NEW_FRAMEBUFFER + | SWR_NEW_RASTERIZER)) { + pipe_viewport_state *state = &ctx->viewport; + pipe_framebuffer_state *fb = &ctx->framebuffer; + pipe_rasterizer_state *rasterizer = ctx->rasterizer; + + SWR_VIEWPORT *vp = &ctx->derived.vp; + SWR_VIEWPORT_MATRIX *vpm = &ctx->derived.vpm; + + vp->x = state->translate[0] - state->scale[0]; + vp->width = state->translate[0] + state->scale[0]; + vp->y = state->translate[1] - fabs(state->scale[1]); + vp->height = state->translate[1] + fabs(state->scale[1]); + if (rasterizer->clip_halfz == 0) { + vp->minZ = state->translate[2] - state->scale[2]; + vp->maxZ = state->translate[2] + state->scale[2]; + } else { + vp->minZ = state->translate[2]; + vp->maxZ = state->translate[2] + state->scale[2]; + } + + vpm->m00 = state->scale[0]; + vpm->m11 = state->scale[1]; + vpm->m22 = state->scale[2]; + 
vpm->m30 = state->translate[0]; + vpm->m31 = state->translate[1]; + vpm->m32 = state->translate[2]; + + /* Now that the matrix is calculated, clip the view coords to screen + * size. OpenGL allows for -ve x,y in the viewport. */ + vp->x = std::max(vp->x, 0.0f); + vp->y = std::max(vp->y, 0.0f); + vp->width = std::min(vp->width, (float)fb->width); + vp->height = std::min(vp->height, (float)fb->height); + + SwrSetViewports(ctx->swrContext, 1, vp, vpm); + } + + /* Set vertex & index buffers */ + /* (using draw info if called by swr_draw_vbo) */ + if (ctx->dirty & SWR_NEW_VERTEX) { + uint32_t size, pitch, max_vertex, partial_inbounds; + const uint8_t *p_data; + + /* If being called by swr_draw_vbo, copy draw details */ + struct pipe_draw_info info = {0}; + if (p_draw_info) + info = *p_draw_info; + + /* vertex buffers */ + SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; + for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { + struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; + + pitch = vb->stride; + if (!vb->user_buffer) { + /* VBO + * size is based on buffer->width0 rather than info.max_index + * to prevent having to validate VBO on each draw */ + size = vb->buffer->width0; + max_vertex = size / pitch; + partial_inbounds = size % pitch; + + p_data = (const uint8_t *)swr_resource_data(vb->buffer) + + vb->buffer_offset; + } else { + /* Client buffer + * client memory is one-time use, re-trigger SWR_NEW_VERTEX to + * revalidate on each draw */ + post_update_dirty_flags |= SWR_NEW_VERTEX; + + if (pitch) { + size = (info.max_index - info.min_index + 1) * pitch; + } else { + /* pitch = 0, means constant value + * set size to 1 vertex */ + size = ctx->velems->stream_pitch[i]; + } + + max_vertex = info.max_index + 1; + partial_inbounds = 0; + + /* Copy only needed vertices to scratch space */ + size = AlignUp(size, 4); + const void *ptr = (const uint8_t *) vb->user_buffer + + info.min_index * pitch; + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->vertex_buffer, ptr, size); + p_data = (const uint8_t *)ptr - info.min_index * pitch; + } + + swrVertexBuffers[i] = {0}; + swrVertexBuffers[i].index = i; + swrVertexBuffers[i].pitch = pitch; + swrVertexBuffers[i].pData = p_data; + swrVertexBuffers[i].size = size; + swrVertexBuffers[i].maxVertex = max_vertex; + swrVertexBuffers[i].partialInboundsSize = partial_inbounds; + } + + SwrSetVertexBuffers( + ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); + + /* index buffer, if required (info passed in by swr_draw_vbo) */ + SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ + if (info.indexed) { + struct pipe_index_buffer *ib = &ctx->index_buffer; + + pitch = ib->index_size ? 
ib->index_size : sizeof(uint32_t); + index_type = swr_convert_index_type(pitch); + + if (!ib->user_buffer) { + /* VBO + * size is based on buffer->width0 rather than info.count + * to prevent having to validate VBO on each draw */ + size = ib->buffer->width0; + p_data = + (const uint8_t *)swr_resource_data(ib->buffer) + ib->offset; + } else { + /* Client buffer + * client memory is one-time use, re-trigger SWR_NEW_VERTEX to + * revalidate on each draw */ + post_update_dirty_flags |= SWR_NEW_VERTEX; + + size = info.count * pitch; + size = AlignUp(size, 4); + + /* Copy indices to scratch space */ + const void *ptr = ib->user_buffer; + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->index_buffer, ptr, size); + p_data = (const uint8_t *)ptr; + } + + SWR_INDEX_BUFFER_STATE swrIndexBuffer; + swrIndexBuffer.format = swr_convert_index_type(ib->index_size); + swrIndexBuffer.pIndices = p_data; + swrIndexBuffer.size = size; + + SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); + } + + struct swr_vertex_element_state *velems = ctx->velems; + if (velems && velems->fsState.indexType != index_type) { + velems->fsFunc = NULL; + velems->fsState.indexType = index_type; + } + } + + /* VertexShader */ + if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_FRAMEBUFFER)) { + SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); + } + + swr_jit_key key; + if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW + | SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) { + memset(&key, 0, sizeof(key)); + swr_generate_fs_key(key, ctx, ctx->fs); + auto search = ctx->fs->map.find(key); + PFN_PIXEL_KERNEL func; + if (search != ctx->fs->map.end()) { + func = search->second; + } else { + func = swr_compile_fs(ctx, key); + ctx->fs->map.insert(std::make_pair(key, func)); + } + SWR_PS_STATE psState = {0}; + psState.pfnPixelShader = func; + psState.killsPixel = ctx->fs->info.base.uses_kill; + psState.inputCoverage = SWR_INPUT_COVERAGE_NORMAL; + psState.writesODepth = ctx->fs->info.base.writes_z; + psState.usesSourceDepth = ctx->fs->info.base.reads_z; + psState.shadingRate = SWR_SHADING_RATE_PIXEL; // XXX + psState.numRenderTargets = ctx->framebuffer.nr_cbufs; + psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE; // XXX msaa + uint32_t barycentricsMask = 0; +#if 0 + // when we switch to mesa-master + if (ctx->fs->info.base.uses_persp_center || + ctx->fs->info.base.uses_linear_center) + barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; + if (ctx->fs->info.base.uses_persp_centroid || + ctx->fs->info.base.uses_linear_centroid) + barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; + if (ctx->fs->info.base.uses_persp_sample || + ctx->fs->info.base.uses_linear_sample) + barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; +#else + for (unsigned i = 0; i < ctx->fs->info.base.num_inputs; i++) { + switch (ctx->fs->info.base.input_interpolate_loc[i]) { + case TGSI_INTERPOLATE_LOC_CENTER: + barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; + break; + } + } +#endif + psState.barycentricsMask = barycentricsMask; + psState.usesUAV = false; // XXX + psState.forceEarlyZ = false; + SwrSetPixelShaderState(ctx->swrContext, &psState); + } + + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_draw_context *pDC = &ctx->swrDC; + + for (unsigned i = 0; i < key.nr_samplers; i++) { + const struct pipe_sampler_state *sampler = + 
ctx->samplers[PIPE_SHADER_FRAGMENT][i]; + + if (sampler) { + pDC->samplersFS[i].min_lod = sampler->min_lod; + pDC->samplersFS[i].max_lod = sampler->max_lod; + pDC->samplersFS[i].lod_bias = sampler->lod_bias; + COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); + } + } + } + + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_draw_context *pDC = &ctx->swrDC; + + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + struct pipe_sampler_view *view = + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; + + if (view) { + struct pipe_resource *res = view->texture; + struct swr_resource *swr_res = swr_resource(res); + struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; + memset(jit_tex, 0, sizeof(*jit_tex)); + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = view->u.tex.first_level; + jit_tex->last_level = view->u.tex.last_level; + jit_tex->base_ptr = swr_res->swr.pBaseAddress; + + for (unsigned level = jit_tex->first_level; + level <= jit_tex->last_level; + level++) { + jit_tex->row_stride[level] = swr_res->row_stride[level]; + jit_tex->img_stride[level] = swr_res->img_stride[level]; + jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; + } + } + } + } + + /* VertexShader Constants */ + if (ctx->dirty & SWR_NEW_VSCONSTANTS) { + swr_draw_context *pDC = &ctx->swrDC; + + for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { + const pipe_constant_buffer *cb = + &ctx->constants[PIPE_SHADER_VERTEX][i]; + pDC->num_constantsVS[i] = cb->buffer_size; + if (cb->buffer) + pDC->constantVS[i] = + (const float *)((const uint8_t *)cb->buffer + cb->buffer_offset); + else { + /* Need to copy these constants to scratch space */ + if (cb->user_buffer && cb->buffer_size) { + const void *ptr = + ((const uint8_t *)cb->user_buffer + cb->buffer_offset); + uint32_t size = AlignUp(cb->buffer_size, 4); + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->vs_constants, ptr, size); + pDC->constantVS[i] = (const float *)ptr; + } + } + } + } + + /* FragmentShader Constants */ + if (ctx->dirty & SWR_NEW_FSCONSTANTS) { + swr_draw_context *pDC = &ctx->swrDC; + + for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { + const pipe_constant_buffer *cb = + &ctx->constants[PIPE_SHADER_FRAGMENT][i]; + pDC->num_constantsFS[i] = cb->buffer_size; + if (cb->buffer) + pDC->constantFS[i] = + (const float *)((const uint8_t *)cb->buffer + cb->buffer_offset); + else { + /* Need to copy these constants to scratch space */ + if (cb->user_buffer && cb->buffer_size) { + const void *ptr = + ((const uint8_t *)cb->user_buffer + cb->buffer_offset); + uint32_t size = AlignUp(cb->buffer_size, 4); + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->fs_constants, ptr, size); + pDC->constantFS[i] = (const float *)ptr; + } + } + } + } + + /* Depth/stencil state */ + if (ctx->dirty & (SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_FRAMEBUFFER)) { + struct pipe_depth_state *depth = &(ctx->depth_stencil->depth); + struct pipe_stencil_state *stencil = ctx->depth_stencil->stencil; + SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; + + /* XXX, incomplete. 
Need to flesh out stencil & alpha test state + struct pipe_stencil_state *front_stencil = + ctx->depth_stencil.stencil[0]; + struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; + struct pipe_alpha_state alpha; + */ + if (stencil[0].enabled) { + depthStencilState.stencilWriteEnable = 1; + depthStencilState.stencilTestEnable = 1; + depthStencilState.stencilTestFunc = + swr_convert_depth_func(stencil[0].func); + + depthStencilState.stencilPassDepthPassOp = + swr_convert_stencil_op(stencil[0].zpass_op); + depthStencilState.stencilPassDepthFailOp = + swr_convert_stencil_op(stencil[0].zfail_op); + depthStencilState.stencilFailOp = + swr_convert_stencil_op(stencil[0].fail_op); + depthStencilState.stencilWriteMask = stencil[0].writemask; + depthStencilState.stencilTestMask = stencil[0].valuemask; + depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; + } + if (stencil[1].enabled) { + depthStencilState.doubleSidedStencilTestEnable = 1; + + depthStencilState.backfaceStencilTestFunc = + swr_convert_depth_func(stencil[1].func); + + depthStencilState.backfaceStencilPassDepthPassOp = + swr_convert_stencil_op(stencil[1].zpass_op); + depthStencilState.backfaceStencilPassDepthFailOp = + swr_convert_stencil_op(stencil[1].zfail_op); + depthStencilState.backfaceStencilFailOp = + swr_convert_stencil_op(stencil[1].fail_op); + depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; + depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; + + depthStencilState.backfaceStencilRefValue = + ctx->stencil_ref.ref_value[1]; + } + + depthStencilState.depthTestEnable = depth->enabled; + depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func); + depthStencilState.depthWriteEnable = depth->writemask; + SwrSetDepthStencilState(ctx->swrContext, &depthStencilState); + } + + /* Blend State */ + if (ctx->dirty & (SWR_NEW_BLEND | + SWR_NEW_FRAMEBUFFER | + SWR_NEW_DEPTH_STENCIL_ALPHA)) { + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + SWR_BLEND_STATE blendState; + memcpy(&blendState, &ctx->blend->blendState, sizeof(blendState)); + blendState.constantColor[0] = ctx->blend_color.color[0]; + blendState.constantColor[1] = ctx->blend_color.color[1]; + blendState.constantColor[2] = ctx->blend_color.color[2]; + blendState.constantColor[3] = ctx->blend_color.color[3]; + blendState.alphaTestReference = + *((uint32_t*)&ctx->depth_stencil->alpha.ref_value); + + // XXX MSAA + blendState.sampleMask = 0; + blendState.sampleCount = SWR_MULTISAMPLE_1X; + + /* If there are no color buffers bound, disable writes on RT0 + * and skip loop */ + if (fb->nr_cbufs == 0) { + blendState.renderTarget[0].writeDisableRed = 1; + blendState.renderTarget[0].writeDisableGreen = 1; + blendState.renderTarget[0].writeDisableBlue = 1; + blendState.renderTarget[0].writeDisableAlpha = 1; + SwrSetBlendFunc(ctx->swrContext, 0, NULL); + } + else + for (int target = 0; + target < std::min(SWR_NUM_RENDERTARGETS, + PIPE_MAX_COLOR_BUFS); + target++) { + if (!fb->cbufs[target]) + continue; + + struct swr_resource *colorBuffer = + swr_resource(fb->cbufs[target]->texture); + + BLEND_COMPILE_STATE compileState; + memset(&compileState, 0, sizeof(compileState)); + compileState.format = colorBuffer->swr.format; + memcpy(&compileState.blendState, + &ctx->blend->compileState[target], + sizeof(compileState.blendState)); + + if (compileState.blendState.blendEnable == false && + compileState.blendState.logicOpEnable == false) { + SwrSetBlendFunc(ctx->swrContext, target, NULL); + continue; + } + + 
compileState.desc.alphaTestEnable = + ctx->depth_stencil->alpha.enabled; + compileState.desc.independentAlphaBlendEnable = + ctx->blend->pipe.independent_blend_enable; + compileState.desc.alphaToCoverageEnable = + ctx->blend->pipe.alpha_to_coverage; + compileState.desc.sampleMaskEnable = 0; // XXX + compileState.desc.numSamples = 1; // XXX + + compileState.alphaTestFunction = + swr_convert_depth_func(ctx->depth_stencil->alpha.func); + compileState.alphaTestFormat = ALPHA_TEST_FLOAT32; // xxx + + PFN_BLEND_JIT_FUNC func = NULL; + auto search = ctx->blendJIT->find(compileState); + if (search != ctx->blendJIT->end()) { + func = search->second; + } else { + HANDLE hJitMgr = screen->hJitMgr; + func = JitCompileBlend(hJitMgr, compileState); + debug_printf("BLEND shader %p\n", func); + assert(func && "Error: BlendShader = NULL"); + + ctx->blendJIT->insert(std::make_pair(compileState, func)); + } + SwrSetBlendFunc(ctx->swrContext, target, func); + } + + SwrSetBlendState(ctx->swrContext, &blendState); + } + + if (ctx->dirty & SWR_NEW_STIPPLE) { + /* XXX What to do with this one??? SWR doesn't stipple */ + } + + if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { + ctx->vs->soState.rasterizerDisable = + ctx->rasterizer->rasterizer_discard; + SwrSetSoState(ctx->swrContext, &ctx->vs->soState); + + pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; + + for (uint32_t i = 0; i < ctx->num_so_targets; i++) { + SWR_STREAMOUT_BUFFER buffer = {0}; + if (!ctx->so_targets[i]) + continue; + buffer.enable = true; + buffer.pBuffer = + (uint32_t *)swr_resource_data(ctx->so_targets[i]->buffer); + buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; + buffer.pitch = stream_output->stride[i]; + buffer.streamOffset = ctx->so_targets[i]->buffer_offset >> 2; + + SwrSetSoBuffers(ctx->swrContext, &buffer, i); + } + } + + uint32_t linkage = ctx->vs->linkageMask; + if (ctx->rasterizer->sprite_coord_enable) + linkage |= (1 << ctx->vs->info.base.num_outputs); + + SwrSetLinkage(ctx->swrContext, linkage, NULL); + + // set up frontend state + SWR_FRONTEND_STATE feState = {0}; + SwrSetFrontendState(ctx->swrContext, &feState); + + // set up backend state + SWR_BACKEND_STATE backendState = {0}; + backendState.numAttributes = 1; + backendState.numComponents[0] = 4; + backendState.constantInterpolationMask = ctx->fs->constantMask; + backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask; + + SwrSetBackendState(ctx->swrContext, &backendState); + + /* Ensure that any in-progress attachment change StoreTiles finish */ + if (swr_is_fence_pending(screen->flush_fence)) + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + + /* Finally, update the in-use status of all resources involved in draw */ + swr_update_resource_status(pipe, p_draw_info); + + ctx->dirty = post_update_dirty_flags; +} + + +static struct pipe_stream_output_target * +swr_create_so_target(struct pipe_context *pipe, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_stream_output_target *target; + + target = CALLOC_STRUCT(pipe_stream_output_target); + if (!target) + return NULL; + + target->context = pipe; + target->reference.count = 1; + pipe_resource_reference(&target->buffer, buffer); + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; + return target; +} + +static void +swr_destroy_so_target(struct pipe_context *pipe, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static 
void +swr_set_so_targets(struct pipe_context *pipe, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct swr_context *swr = swr_context(pipe); + uint32_t i; + + assert(num_targets < MAX_SO_STREAMS); + + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference( + (struct pipe_stream_output_target **)&swr->so_targets[i], + targets[i]); + } + + for (/* fall-through */; i < swr->num_so_targets; i++) { + pipe_so_target_reference( + (struct pipe_stream_output_target **)&swr->so_targets[i], NULL); + } + + swr->num_so_targets = num_targets; + + swr->dirty = SWR_NEW_SO; +} + + +void +swr_state_init(struct pipe_context *pipe) +{ + pipe->create_blend_state = swr_create_blend_state; + pipe->bind_blend_state = swr_bind_blend_state; + pipe->delete_blend_state = swr_delete_blend_state; + + pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; + pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; + pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; + + pipe->create_rasterizer_state = swr_create_rasterizer_state; + pipe->bind_rasterizer_state = swr_bind_rasterizer_state; + pipe->delete_rasterizer_state = swr_delete_rasterizer_state; + + pipe->create_sampler_state = swr_create_sampler_state; + pipe->bind_sampler_states = swr_bind_sampler_states; + pipe->delete_sampler_state = swr_delete_sampler_state; + + pipe->create_sampler_view = swr_create_sampler_view; + pipe->set_sampler_views = swr_set_sampler_views; + pipe->sampler_view_destroy = swr_sampler_view_destroy; + + pipe->create_vs_state = swr_create_vs_state; + pipe->bind_vs_state = swr_bind_vs_state; + pipe->delete_vs_state = swr_delete_vs_state; + + pipe->create_fs_state = swr_create_fs_state; + pipe->bind_fs_state = swr_bind_fs_state; + pipe->delete_fs_state = swr_delete_fs_state; + + pipe->set_constant_buffer = swr_set_constant_buffer; + + pipe->create_vertex_elements_state = swr_create_vertex_elements_state; + pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; + pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; + + pipe->set_vertex_buffers = swr_set_vertex_buffers; + pipe->set_index_buffer = swr_set_index_buffer; + + pipe->set_polygon_stipple = swr_set_polygon_stipple; + pipe->set_clip_state = swr_set_clip_state; + pipe->set_scissor_states = swr_set_scissor_states; + pipe->set_viewport_states = swr_set_viewport_states; + + pipe->set_framebuffer_state = swr_set_framebuffer_state; + + pipe->set_blend_color = swr_set_blend_color; + pipe->set_stencil_ref = swr_set_stencil_ref; + + pipe->set_sample_mask = swr_set_sample_mask; + + pipe->create_stream_output_target = swr_create_so_target; + pipe->stream_output_target_destroy = swr_destroy_so_target; + pipe->set_stream_output_targets = swr_set_so_targets; +} diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h new file mode 100644 index 00000000000..f0a7ff3b185 --- /dev/null +++ b/src/gallium/drivers/swr/swr_state.h @@ -0,0 +1,307 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_STATE_H +#define SWR_STATE_H + +#include "pipe/p_defines.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_hash.h" +#include "api.h" +#include "swr_tex_sample.h" +#include "swr_shader.h" +#include <unordered_map> + +/* skeleton */ +struct swr_vertex_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + unsigned linkageMask; + PFN_VERTEX_FUNC func; + SWR_STREAMOUT_STATE soState; + PFN_SO_FUNC soFunc[PIPE_PRIM_MAX]; +}; + +struct swr_fragment_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + uint32_t constantMask; + uint32_t pointSpriteMask; + std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> map; +}; + +/* Vertex element state */ +struct swr_vertex_element_state { + FETCH_COMPILE_STATE fsState; + PFN_FETCH_FUNC fsFunc; + uint32_t stream_pitch[PIPE_MAX_ATTRIBS]; +}; + +struct swr_blend_state { + struct pipe_blend_state pipe; + SWR_BLEND_STATE blendState; + RENDER_TARGET_BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS]; +}; + +/* + * Derived SWR API DrawState + * For convenience of making simple changes without re-deriving state. + */ +struct swr_derived_state { + SWR_RASTSTATE rastState; + SWR_VIEWPORT vp; + SWR_VIEWPORT_MATRIX vpm; +}; + +void swr_update_derived(struct pipe_context *, + const struct pipe_draw_info * = nullptr); + +/* + * Conversion functions: Convert mesa state defines to SWR. 
+ */
+
+static INLINE SWR_LOGIC_OP
+swr_convert_logic_op(const UINT op)
+{
+   switch (op) {
+   case PIPE_LOGICOP_CLEAR:
+      return LOGICOP_CLEAR;
+   case PIPE_LOGICOP_NOR:
+      return LOGICOP_NOR;
+   case PIPE_LOGICOP_AND_INVERTED:
+      return LOGICOP_AND_INVERTED;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      return LOGICOP_COPY_INVERTED;
+   case PIPE_LOGICOP_AND_REVERSE:
+      return LOGICOP_AND_REVERSE;
+   case PIPE_LOGICOP_INVERT:
+      return LOGICOP_INVERT;
+   case PIPE_LOGICOP_XOR:
+      return LOGICOP_XOR;
+   case PIPE_LOGICOP_NAND:
+      return LOGICOP_NAND;
+   case PIPE_LOGICOP_AND:
+      return LOGICOP_AND;
+   case PIPE_LOGICOP_EQUIV:
+      return LOGICOP_EQUIV;
+   case PIPE_LOGICOP_NOOP:
+      return LOGICOP_NOOP;
+   case PIPE_LOGICOP_OR_INVERTED:
+      return LOGICOP_OR_INVERTED;
+   case PIPE_LOGICOP_COPY:
+      return LOGICOP_COPY;
+   case PIPE_LOGICOP_OR_REVERSE:
+      return LOGICOP_OR_REVERSE;
+   case PIPE_LOGICOP_OR:
+      return LOGICOP_OR;
+   case PIPE_LOGICOP_SET:
+      return LOGICOP_SET;
+   default:
+      assert(0 && "Unsupported logic op");
+      return LOGICOP_NOOP;
+   }
+}
+
+static INLINE SWR_STENCILOP
+swr_convert_stencil_op(const UINT op)
+{
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      return STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:
+      return STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:
+      return STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:
+      return STENCILOP_INCRSAT;
+   case PIPE_STENCIL_OP_DECR:
+      return STENCILOP_DECRSAT;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      return STENCILOP_INCR;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      return STENCILOP_DECR;
+   case PIPE_STENCIL_OP_INVERT:
+      return STENCILOP_INVERT;
+   default:
+      assert(0 && "Unsupported stencil op");
+      return STENCILOP_KEEP;
+   }
+}
+
+static INLINE SWR_FORMAT
+swr_convert_index_type(const UINT index_size)
+{
+   switch (index_size) {
+   case sizeof(unsigned char):
+      return R8_UINT;
+   case sizeof(unsigned short):
+      return R16_UINT;
+   case sizeof(unsigned int):
+      return R32_UINT;
+   default:
+      assert(0 && "Unsupported index type");
+      return R32_UINT;
+   }
+}
+
+
+static INLINE SWR_ZFUNCTION
+swr_convert_depth_func(const UINT pipe_func)
+{
+   switch (pipe_func) {
+   case PIPE_FUNC_NEVER:
+      return ZFUNC_NEVER;
+   case PIPE_FUNC_LESS:
+      return ZFUNC_LT;
+   case PIPE_FUNC_EQUAL:
+      return ZFUNC_EQ;
+   case PIPE_FUNC_LEQUAL:
+      return ZFUNC_LE;
+   case PIPE_FUNC_GREATER:
+      return ZFUNC_GT;
+   case PIPE_FUNC_NOTEQUAL:
+      return ZFUNC_NE;
+   case PIPE_FUNC_GEQUAL:
+      return ZFUNC_GE;
+   case PIPE_FUNC_ALWAYS:
+      return ZFUNC_ALWAYS;
+   default:
+      assert(0 && "Unsupported depth func");
+      return ZFUNC_ALWAYS;
+   }
+}
+
+
+static INLINE SWR_CULLMODE
+swr_convert_cull_mode(const UINT cull_face)
+{
+   switch (cull_face) {
+   case PIPE_FACE_NONE:
+      return SWR_CULLMODE_NONE;
+   case PIPE_FACE_FRONT:
+      return SWR_CULLMODE_FRONT;
+   case PIPE_FACE_BACK:
+      return SWR_CULLMODE_BACK;
+   case PIPE_FACE_FRONT_AND_BACK:
+      return SWR_CULLMODE_BOTH;
+   default:
+      assert(0 && "Invalid cull mode");
+      return SWR_CULLMODE_NONE;
+   }
+}
+
+static INLINE SWR_BLEND_OP
+swr_convert_blend_func(const UINT blend_func)
+{
+   switch (blend_func) {
+   case PIPE_BLEND_ADD:
+      return BLENDOP_ADD;
+   case PIPE_BLEND_SUBTRACT:
+      return BLENDOP_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return BLENDOP_REVSUBTRACT;
+   case PIPE_BLEND_MIN:
+      return BLENDOP_MIN;
+   case PIPE_BLEND_MAX:
+      return BLENDOP_MAX;
+   default:
+      assert(0 && "Invalid blend func");
+      return BLENDOP_ADD;
+   }
+}
+
+static INLINE SWR_BLEND_FACTOR
+swr_convert_blend_factor(const UINT blend_factor)
+{
+   switch (blend_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      return BLENDFACTOR_ONE;
+   case
PIPE_BLENDFACTOR_SRC_COLOR: + return BLENDFACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return BLENDFACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return BLENDFACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return BLENDFACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return BLENDFACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return BLENDFACTOR_CONST_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return BLENDFACTOR_CONST_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return BLENDFACTOR_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return BLENDFACTOR_SRC1_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return BLENDFACTOR_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return BLENDFACTOR_INV_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return BLENDFACTOR_INV_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return BLENDFACTOR_INV_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return BLENDFACTOR_INV_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return BLENDFACTOR_INV_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return BLENDFACTOR_INV_CONST_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return BLENDFACTOR_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return BLENDFACTOR_INV_SRC1_ALPHA; + default: + assert(0 && "Invalid blend factor"); + return BLENDFACTOR_ONE; + } +} + +static INLINE enum SWR_SURFACE_TYPE +swr_convert_target_type(const enum pipe_texture_target target) +{ + switch (target) { + case PIPE_BUFFER: + return SURFACE_BUFFER; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return SURFACE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + return SURFACE_2D; + case PIPE_TEXTURE_3D: + return SURFACE_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return SURFACE_CUBE; + default: + assert(0); + return SURFACE_NULL; + } +} +#endif diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp new file mode 100644 index 00000000000..8e01e32e280 --- /dev/null +++ b/src/gallium/drivers/swr/swr_tex_sample.cpp @@ -0,0 +1,338 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * Largely a copy of llvmpipe's lp_tex_sample.c + */ + +/** + * Texture sampling code generation + * + * This file is nothing more than ugly glue between three largely independent + * entities: + * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) + * - texture sampling code generation (i.e., lp_build_sample_soa) + * - SWR driver + * + * All interesting code is in the functions mentioned above. There is really + * nothing to see here. + * + * @author Jose Fonseca <[email protected]> + */ + +#include "state.h" +#include "JitManager.h" +#include "state_llvm.h" + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_sample.h" +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_memory.h" + +#include "swr_tex_sample.h" +#include "swr_context_llvm.h" + + +/** + * This provides the bridge between the sampler state store in + * lp_jit_context and lp_jit_texture and the sampler code + * generator. It provides the texture layout information required by + * the texture sampler code generator in terms of the state stored in + * lp_jit_context and lp_jit_texture in runtime. + */ +struct swr_sampler_dynamic_state { + struct lp_sampler_dynamic_state base; + + const struct swr_sampler_static_state *static_state; +}; + + +/** + * This is the bridge between our sampler and the TGSI translator. + */ +struct swr_sampler_soa { + struct lp_build_sampler_soa base; + + struct swr_sampler_dynamic_state dynamic_state; +}; + + +/** + * Fetch the specified member of the lp_jit_texture structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +swr_texture_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned texture_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].textures */ + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); + /* context[0].textures[unit] */ + indices[2] = lp_build_const_int32(gallivm, texture_unit); + /* context[0].textures[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); + + return res; +} + + +/** + * Helper macro to instantiate the functions that generate the code to + * fetch the members of lp_jit_texture to fulfill the sampler code + * generator requests. + * + * This complexity is the price we have to pay to keep the texture + * sampler code generator a reusable module without dependencies to + * swr internals. 
+ */ +#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ + static LLVMValueRef swr_texture_##_name( \ + const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned texture_unit) \ + { \ + return swr_texture_member(base, \ + gallivm, \ + context_ptr, \ + texture_unit, \ + swr_jit_texture_##_name, \ + #_name, \ + _emit_load); \ + } + + +SWR_TEXTURE_MEMBER(width, TRUE) +SWR_TEXTURE_MEMBER(height, TRUE) +SWR_TEXTURE_MEMBER(depth, TRUE) +SWR_TEXTURE_MEMBER(first_level, TRUE) +SWR_TEXTURE_MEMBER(last_level, TRUE) +SWR_TEXTURE_MEMBER(base_ptr, TRUE) +SWR_TEXTURE_MEMBER(row_stride, FALSE) +SWR_TEXTURE_MEMBER(img_stride, FALSE) +SWR_TEXTURE_MEMBER(mip_offsets, FALSE) + + +/** + * Fetch the specified member of the lp_jit_sampler structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +swr_sampler_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned sampler_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(sampler_unit < PIPE_MAX_SAMPLERS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].samplers */ + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); + /* context[0].samplers[unit] */ + indices[2] = lp_build_const_int32(gallivm, sampler_unit); + /* context[0].samplers[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); + + return res; +} + + +#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ + static LLVMValueRef swr_sampler_##_name( \ + const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned sampler_unit) \ + { \ + return swr_sampler_member(base, \ + gallivm, \ + context_ptr, \ + sampler_unit, \ + swr_jit_sampler_##_name, \ + #_name, \ + _emit_load); \ + } + + +SWR_SAMPLER_MEMBER(min_lod, TRUE) +SWR_SAMPLER_MEMBER(max_lod, TRUE) +SWR_SAMPLER_MEMBER(lod_bias, TRUE) +SWR_SAMPLER_MEMBER(border_color, FALSE) + + +static void +swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) +{ + FREE(sampler); +} + + +/** + * Fetch filtered values from texture. + * The 'texel' parameter returns four vectors corresponding to R, G, B, A. 
+ */ +static void +swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + const struct lp_sampler_params *params) +{ + struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; + unsigned texture_index = params->texture_index; + unsigned sampler_index = params->sampler_index; + + assert(sampler_index < PIPE_MAX_SAMPLERS); + assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); + +#if 0 + lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); +#else + lp_build_sample_soa( + &sampler->dynamic_state.static_state[texture_index].texture_state, + &sampler->dynamic_state.static_state[sampler_index].sampler_state, + &sampler->dynamic_state.base, + gallivm, + params); +#endif +} + +/** + * Fetch the texture size. + */ +static void +swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + struct lp_type type, + unsigned texture_unit, + unsigned target, + LLVMValueRef context_ptr, + boolean is_sviewinfo, + enum lp_sampler_lod_property lod_property, + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *sizes_out) +{ + struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + lp_build_size_query_soa( + gallivm, + &sampler->dynamic_state.static_state[texture_unit].texture_state, + &sampler->dynamic_state.base, + type, + texture_unit, + target, + context_ptr, + is_sviewinfo, + lod_property, + explicit_lod, + sizes_out); +} + + +struct lp_build_sampler_soa * +swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) +{ + struct swr_sampler_soa *sampler; + + sampler = CALLOC_STRUCT(swr_sampler_soa); + if (!sampler) + return NULL; + + sampler->base.destroy = swr_sampler_soa_destroy; + sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; + sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; + sampler->dynamic_state.base.width = swr_texture_width; + sampler->dynamic_state.base.height = swr_texture_height; + sampler->dynamic_state.base.depth = swr_texture_depth; + sampler->dynamic_state.base.first_level = swr_texture_first_level; + sampler->dynamic_state.base.last_level = swr_texture_last_level; + sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; + sampler->dynamic_state.base.row_stride = swr_texture_row_stride; + sampler->dynamic_state.base.img_stride = swr_texture_img_stride; + sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; + sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; + sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; + sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; + sampler->dynamic_state.base.border_color = swr_sampler_border_color; + + sampler->dynamic_state.static_state = static_state; + + return &sampler->base; +} diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h new file mode 100644 index 00000000000..f5c368c108d --- /dev/null +++ b/src/gallium/drivers/swr/swr_tex_sample.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#pragma once + +#include "gallivm/lp_bld.h" + +struct swr_sampler_static_state { + /* + * These attributes are effectively interleaved for more sane key handling. + * However, there might be lots of null space if the amount of samplers and + * textures isn't the same. + */ + struct lp_static_sampler_state sampler_state; + struct lp_static_texture_state texture_state; +}; + +/** + * Pure-LLVM texture sampling code generator. + * + */ +struct lp_build_sampler_soa * +swr_sampler_soa_create(const struct swr_sampler_static_state *key); diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c index 0612109c800..b24e1856aca 100644 --- a/src/gallium/drivers/trace/tr_screen.c +++ b/src/gallium/drivers/trace/tr_screen.c @@ -313,7 +313,8 @@ trace_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * trace_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct trace_screen *tr_screen = trace_screen(_screen); struct pipe_screen *screen = tr_screen->screen; @@ -321,7 +322,7 @@ trace_screen_resource_from_handle(struct pipe_screen *_screen, /* TODO trace call */ - result = screen->resource_from_handle(screen, templ, handle); + result = screen->resource_from_handle(screen, templ, handle, usage); result = trace_resource_create(trace_screen(_screen), result); @@ -331,7 +332,8 @@ trace_screen_resource_from_handle(struct pipe_screen *_screen, static boolean trace_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *_resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct trace_screen *tr_screen = trace_screen(_screen); struct trace_resource *tr_resource = trace_resource(_resource); @@ -340,7 +342,7 @@ trace_screen_resource_get_handle(struct pipe_screen *_screen, /* TODO trace call */ - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index a9a2742ec66..c5df0f17986 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -28,7 +28,7 @@ 
C_SOURCES := \ vc4_opt_cse.c \ vc4_opt_dead_code.c \ vc4_opt_small_immediates.c \ - vc4_opt_vpm_writes.c \ + vc4_opt_vpm.c \ vc4_program.c \ vc4_qir.c \ vc4_qir_lower_uniforms.c \ diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c index 73ded766db9..d15b0c1a39f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -22,16 +22,18 @@ */ /** - * @file vc4_opt_vpm_writes.c + * @file vc4_opt_vpm.c * - * This modifies instructions that generate the value consumed by a VPM write - * to write directly into the VPM. + * This modifies instructions that: + * 1. exclusively consume a value read from the VPM to directly read the VPM if + * other operands allow it. + * 2. generate the value consumed by a VPM write to write directly into the VPM. */ #include "vc4_qir.h" bool -qir_opt_vpm_writes(struct vc4_compile *c) +qir_opt_vpm(struct vc4_compile *c) { if (c->stage == QSTAGE_FRAG) return false; @@ -52,8 +54,70 @@ qir_opt_vpm_writes(struct vc4_compile *c) } for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP) - use_count[inst->src[i].index]++; + if (inst->src[i].file == QFILE_TEMP) { + uint32_t temp = inst->src[i].index; + use_count[temp]++; + } + } + } + + /* For instructions reading from a temporary that contains a VPM read + * result, try to move the instruction up in place of the VPM read. + */ + list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (!inst || qir_is_multi_instruction(inst)) + continue; + + if (qir_depends_on_flags(inst) || inst->sf) + continue; + + if (qir_has_side_effects(c, inst) || + qir_has_side_effect_reads(c, inst) || + qir_is_tex(inst)) + continue; + + for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + if (inst->src[j].file != QFILE_TEMP || + inst->src[j].pack) + continue; + + uint32_t temp = inst->src[j].index; + + /* Since VPM reads pull from a FIFO, we only get to + * read each VPM entry once (unless we reset the read + * pointer). That means we can't copy-propagate a VPM + * read to multiple locations. 
+ */ + if (use_count[temp] != 1) + continue; + + struct qinst *mov = c->defs[temp]; + if (!mov || + (mov->op != QOP_MOV && + mov->op != QOP_FMOV && + mov->op != QOP_MMOV) || + mov->src[0].file != QFILE_VPM) { + continue; + } + + uint32_t temps = 0; + for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) { + if (inst->src[k].file == QFILE_TEMP) + temps++; + } + + /* The instruction is safe to reorder if its other + * sources are independent of previous instructions + */ + if (temps == 1) { + list_del(&inst->link); + inst->src[j] = mov->src[0]; + list_replace(&mov->link, &inst->link); + c->defs[temp] = NULL; + free(mov); + progress = true; + break; + } } } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 5c91c02b539..81e8e9150d6 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1729,6 +1729,8 @@ nir_to_qir(struct vc4_compile *c) } static const nir_shader_compiler_options nir_options = { + .lower_extract_byte = true, + .lower_extract_word = true, .lower_ffma = true, .lower_flrp = true, .lower_fpow = true, diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index f9eb0e151c5..65f0067c61e 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -526,7 +526,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_copy_propagation); OPTPASS(qir_opt_dead_code); OPTPASS(qir_opt_small_immediates); - OPTPASS(qir_opt_vpm_writes); + OPTPASS(qir_opt_vpm); if (!progress) break; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index bae31768bd8..4f39d72f552 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -484,7 +484,7 @@ bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); -bool qir_opt_vpm_writes(struct vc4_compile *c); +bool qir_opt_vpm(struct vc4_compile *c); void vc4_nir_lower_blend(struct vc4_compile *c); void vc4_nir_lower_io(struct vc4_compile *c); nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 036da329987..ea212af0512 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -523,7 +523,8 @@ fail: static struct pipe_resource * vc4_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl); struct pipe_resource *prsc = &rsc->base.b; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index a4b3efcfda3..92d910ba6a5 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -203,6 +203,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; /* Stream output. 
*/ diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c index 0b2fc4ec497..2b3794765e2 100644 --- a/src/gallium/drivers/virgl/virgl_resource.c +++ b/src/gallium/drivers/virgl/virgl_resource.c @@ -64,7 +64,8 @@ static struct pipe_resource *virgl_resource_create(struct pipe_screen *screen, static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct virgl_screen *vs = virgl_screen(screen); if (templ->target == PIPE_BUFFER) diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index d3f4e259cad..8126bdec40c 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -235,6 +235,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_STRING_MARKER: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 010be62e638..bdd76ab1f81 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -404,9 +404,9 @@ enum pipe_flush_flags * The third flag has been added to be able to force textures to be created * in linear mode (no tiling). */ -#define PIPE_BIND_SCANOUT (1 << 18) /* */ -#define PIPE_BIND_SHARED (1 << 19) /* get_texture_handle ??? */ -#define PIPE_BIND_LINEAR (1 << 20) +#define PIPE_BIND_SCANOUT (1 << 19) /* */ +#define PIPE_BIND_SHARED (1 << 20) /* get_texture_handle ??? */ +#define PIPE_BIND_LINEAR (1 << 21) /** @@ -530,6 +530,25 @@ enum pipe_reset_status /** + * resource_get_handle flags. + */ +/* Requires pipe_context::flush_resource before external use. */ +#define PIPE_HANDLE_USAGE_EXPLICIT_FLUSH (1 << 0) +/* Expected external use of the resource: */ +#define PIPE_HANDLE_USAGE_READ (1 << 1) +#define PIPE_HANDLE_USAGE_WRITE (1 << 2) +#define PIPE_HANDLE_USAGE_READ_WRITE (PIPE_HANDLE_USAGE_READ | \ + PIPE_HANDLE_USAGE_WRITE) + +/** + * pipe_image_view access flags. 
+ */ +#define PIPE_IMAGE_ACCESS_READ (1 << 0) +#define PIPE_IMAGE_ACCESS_WRITE (1 << 1) +#define PIPE_IMAGE_ACCESS_READ_WRITE (PIPE_IMAGE_ACCESS_READ | \ + PIPE_IMAGE_ACCESS_WRITE) + +/** * Implementation capabilities/limits which are queried through * pipe_screen::get_param() */ @@ -658,6 +677,10 @@ enum pipe_cap PIPE_CAP_SURFACE_REINTERPRET_BLOCKS, PIPE_CAP_QUERY_BUFFER_OBJECT, PIPE_CAP_QUERY_MEMORY_INFO, + PIPE_CAP_PCI_GROUP, + PIPE_CAP_PCI_BUS, + PIPE_CAP_PCI_DEVICE, + PIPE_CAP_PCI_FUNCTION, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h index ab18523a80c..b22baa9c650 100644 --- a/src/gallium/include/pipe/p_format.h +++ b/src/gallium/include/pipe/p_format.h @@ -29,12 +29,12 @@ #ifndef PIPE_FORMAT_H #define PIPE_FORMAT_H +#include "p_config.h" + #ifdef __cplusplus extern "C" { #endif -#include "p_config.h" - /** * Formats for textures, surfaces and vertex data */ diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h index 211bc2440f9..4f30e75ab49 100644 --- a/src/gallium/include/pipe/p_screen.h +++ b/src/gallium/include/pipe/p_screen.h @@ -182,10 +182,13 @@ struct pipe_screen { * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller * retains ownership of the FD. (This is consistent with * EGL_EXT_image_dma_buf_import) + * + * \param usage A combination of PIPE_HANDLE_USAGE_* flags. */ struct pipe_resource * (*resource_from_handle)(struct pipe_screen *, const struct pipe_resource *templat, - struct winsys_handle *handle); + struct winsys_handle *handle, + unsigned usage); /** * Create a resource from user memory. This maps the user memory into @@ -203,10 +206,13 @@ struct pipe_screen { * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller * takes ownership of the FD. (This is consistent with * EGL_MESA_image_dma_buf_export) + * + * \param usage A combination of PIPE_HANDLE_USAGE_* flags. 
*/
   boolean (*resource_get_handle)(struct pipe_screen *,
                                  struct pipe_resource *tex,
-                                 struct winsys_handle *handle);
+                                 struct winsys_handle *handle,
+                                 unsigned usage);

   void (*resource_destroy)(struct pipe_screen *,
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 9d4a96a5a7e..7a34841088a 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -277,7 +277,8 @@ union tgsi_immediate_data
#define TGSI_PROPERTY_TES_POINT_MODE 14
#define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED 15
#define TGSI_PROPERTY_NUM_CULLDIST_ENABLED 16
-#define TGSI_PROPERTY_COUNT 17
+#define TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL 17
+#define TGSI_PROPERTY_COUNT 18

struct tgsi_property {
   unsigned Type : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */
@@ -743,7 +744,9 @@ struct tgsi_dst_register
struct tgsi_instruction_memory
{
   unsigned Qualifier : 3;  /* TGSI_MEMORY_ */
-   unsigned Padding : 29;
+   unsigned Texture : 8;   /* only for images: TGSI_TEXTURE_ */
+   unsigned Format : 10;   /* only for images: PIPE_FORMAT_ */
+   unsigned Padding : 11;
};

#define TGSI_MEMBAR_SHADER_BUFFER (1 << 0)
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index c568c483940..2e720ce25f3 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -393,13 +393,14 @@ struct pipe_sampler_view

/**
- * A description of a writable buffer or texture that can be bound to a shader
+ * A description of a buffer or texture image that can be bound to a shader
 * stage.
 */
struct pipe_image_view
{
   struct pipe_resource *resource; /**< resource into which this is a view */

   enum pipe_format format;      /**< typed PIPE_FORMAT_x */
+   unsigned access;              /**< PIPE_IMAGE_ACCESS_x */

   union {
      struct {
diff --git a/src/gallium/include/pipe/p_video_codec.h b/src/gallium/include/pipe/p_video_codec.h
index 196d00bc546..b5575ab9afa 100644
--- a/src/gallium/include/pipe/p_video_codec.h
+++ b/src/gallium/include/pipe/p_video_codec.h
@@ -28,12 +28,12 @@
#ifndef PIPE_VIDEO_CONTEXT_H
#define PIPE_VIDEO_CONTEXT_H

+#include "pipe/p_video_state.h"
+
#ifdef __cplusplus
extern "C" {
#endif

-#include "pipe/p_video_state.h"
-
struct pipe_screen;
struct pipe_surface;
struct pipe_macroblock;
diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index 9a20146f43e..aff7842a888 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -28,6 +28,10 @@
#ifndef PIPE_VIDEO_ENUMS_H
#define PIPE_VIDEO_ENUMS_H

+#ifdef __cplusplus
+extern "C" {
+#endif
+
enum pipe_video_format
{
   PIPE_VIDEO_FORMAT_UNKNOWN = 0,
@@ -87,4 +91,8 @@ enum pipe_video_entrypoint
   PIPE_VIDEO_ENTRYPOINT_ENCODE
};

+#if defined(__cplusplus)
+}
+#endif
+
#endif /* PIPE_VIDEO_ENUMS_H */
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 8396be91553..c12755b0420 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -55,7 +55,7 @@ kernel::launch(command_queue &q,
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
-   struct pipe_grid_info info;
+   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
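The p_screen.h hunk above threads a new usage argument through resource_from_handle and resource_get_handle, and the state-tracker hunks that follow update every call site to pass one of the new PIPE_HANDLE_USAGE_* flags. A minimal sketch of the resulting export pattern, assuming the post-patch interface; the helper name, header list, and bool return type are illustrative and not part of the patch:

#include <stdbool.h>
#include <string.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "state_tracker/drm_driver.h" /* struct winsys_handle */

/* Hypothetical helper: export a resource as a DMA-BUF fd under the
 * new interface.  Mirrors the call sites in the hunks below. */
static bool
export_dmabuf(struct pipe_screen *screen, struct pipe_resource *resource,
              int *fd_out)
{
   struct winsys_handle whandle;

   memset(&whandle, 0, sizeof(whandle));
   whandle.type = DRM_API_HANDLE_TYPE_FD;

   /* Declare how the importer is expected to touch the buffer. */
   if (!screen->resource_get_handle(screen, resource, &whandle,
                                    PIPE_HANDLE_USAGE_READ_WRITE))
      return false;

   *fd_out = whandle.handle; /* for TYPE_FD the handle is the fd */
   return true;
}

The flags are a promise about external access rather than a behavioral switch: a driver may ignore them, or use PIPE_HANDLE_USAGE_EXPLICIT_FLUSH to require pipe_context::flush_resource before the handle is consumed, which is the combination the dri2 and nine call sites below pass for scanout-style sharing.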
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index a11a6cbbb0c..7f7fbc47e6d 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -354,7 +354,8 @@ dri2_allocate_buffer(__DRIscreen *sPriv, whandle.type = DRM_API_HANDLE_TYPE_KMS; screen->base.screen->resource_get_handle(screen->base.screen, - buffer->resource, &whandle); + buffer->resource, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ); buffer->base.attachment = attachment; buffer->base.name = whandle.handle; @@ -539,7 +540,8 @@ dri2_allocate_textures(struct dri_context *ctx, whandle.type = DRM_API_HANDLE_TYPE_KMS; drawable->textures[statt] = screen->base.screen->resource_from_handle(screen->base.screen, - &templ, &whandle); + &templ, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ); assert(drawable->textures[statt]); } } @@ -756,7 +758,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, whandle->stride = pitch * util_format_get_blocksize(pf); img->texture = screen->base.screen->resource_from_handle(screen->base.screen, - &templ, whandle); + &templ, whandle, PIPE_HANDLE_USAGE_READ_WRITE); if (!img->texture) { FREE(img); return NULL; @@ -765,6 +767,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, img->level = 0; img->layer = 0; img->dri_format = format; + img->use = 0; img->loader_private = loaderPrivate; return img; @@ -884,6 +887,7 @@ dri2_create_image(__DRIscreen *_screen, img->layer = 0; img->dri_format = format; img->dri_components = 0; + img->use = use; img->loader_private = loaderPrivate; return img; @@ -893,31 +897,38 @@ static GLboolean dri2_query_image(__DRIimage *image, int attrib, int *value) { struct winsys_handle whandle; + unsigned usage; + + if (image->use & __DRI_IMAGE_USE_BACKBUFFER) + usage = PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ; + else + usage = PIPE_HANDLE_USAGE_READ_WRITE; + memset(&whandle, 0, sizeof(whandle)); switch (attrib) { case __DRI_IMAGE_ATTRIB_STRIDE: whandle.type = DRM_API_HANDLE_TYPE_KMS; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.stride; return GL_TRUE; case __DRI_IMAGE_ATTRIB_HANDLE: whandle.type = DRM_API_HANDLE_TYPE_KMS; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_NAME: whandle.type = DRM_API_HANDLE_TYPE_SHARED; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_FD: whandle.type= DRM_API_HANDLE_TYPE_FD; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_FORMAT: diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h index 45459906588..dc4692a1c6b 100644 --- a/src/gallium/state_trackers/dri/dri_screen.h +++ b/src/gallium/state_trackers/dri/dri_screen.h @@ -109,6 +109,7 @@ struct __DRIimageRec { unsigned layer; uint32_t dri_format; uint32_t dri_components; + unsigned use; void *loader_private; diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c index 0456d44104e..1c541b76db5 100644 --- 
a/src/gallium/state_trackers/glx/xlib/glx_api.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_api.c
@@ -615,6 +615,7 @@ close_display_callback(Display *dpy, XExtCodes *codes)
{
   xmesa_destroy_buffers_on_display(dpy);
   destroy_visuals_on_display(dpy);
+   xmesa_close_display(dpy);
   return 0;
}
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index 2f5e1f5f1a8..5799cce033c 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -110,14 +110,6 @@ void xmesa_set_driver( const struct xm_driver *templ )
}

-/*
- * XXX replace this with a linked list, or better yet, try to attach the
- * gallium/mesa extra bits to the X Display object with XAddExtension().
- */
-#define MAX_DISPLAYS 10
-static struct xmesa_display Displays[MAX_DISPLAYS];
-static int NumDisplays = 0;
-
static int
xmesa_get_param(struct st_manager *smapi,
                enum st_manager_param param)
@@ -130,61 +122,145 @@ xmesa_get_param(struct st_manager *smapi,
   }
}

+/* linked list of XMesaDisplay hooks per display */
+typedef struct _XMesaExtDisplayInfo {
+   struct _XMesaExtDisplayInfo *next;
+   Display *display;
+   struct xmesa_display mesaDisplay;
+} XMesaExtDisplayInfo;
+
+typedef struct _XMesaExtInfo {
+   XMesaExtDisplayInfo *head;
+   int ndisplays;
+} XMesaExtInfo;
+
+static XMesaExtInfo MesaExtInfo;
+
+/* hook to delete XMesaDisplay on XDestroyDisplay */
+extern void
+xmesa_close_display(Display *display)
+{
+   XMesaExtDisplayInfo *info, *prev;
+
+   assert(MesaExtInfo.ndisplays > 0);
+   assert(MesaExtInfo.head);
+
+   _XLockMutex(_Xglobal_lock);
+   /* first find the display, remembering its predecessor */
+   prev = NULL;
+   for (info = MesaExtInfo.head; info; info = info->next) {
+      if (info->display == display)
+         break;
+      prev = info;
+   }
+
+   if (info == NULL) {
+      /* no display found */
+      _XUnlockMutex(_Xglobal_lock);
+      return;
+   }
+
+   /* remove display entry from list; prev is NULL for the list head */
+   if (prev) {
+      prev->next = info->next;
+   } else {
+      MesaExtInfo.head = info->next;
+   }
+   MesaExtInfo.ndisplays--;
+
+   _XUnlockMutex(_Xglobal_lock);
+
+   /* don't forget to clean up mesaDisplay */
+   XMesaDisplay xmdpy = &info->mesaDisplay;
+
+   /**
+    * XXX: Don't destroy the screens here, since there may still
+    * be some dangling screen pointers that are used after this point
+    * if (xmdpy->screen) {
+    *    xmdpy->screen->destroy(xmdpy->screen);
+    * }
+    */
+   free(xmdpy->smapi);
+
+   XFree((char *) info);
+}
+
static XMesaDisplay
xmesa_init_display( Display *display )
{
   pipe_static_mutex(init_mutex);
   XMesaDisplay xmdpy;
-   int i;
+   XMesaExtDisplayInfo *info;
+
+   if (display == NULL) {
+      return NULL;
+   }

   pipe_mutex_lock(init_mutex);

-   /* Look for XMesaDisplay which corresponds to 'display' */
-   for (i = 0; i < NumDisplays; i++) {
-      if (Displays[i].display == display) {
+   /* Look for XMesaDisplay which corresponds to this display */
+   info = MesaExtInfo.head;
+   while (info) {
+      if (info->display == display) {
         /* Found it */
         pipe_mutex_unlock(init_mutex);
-         return &Displays[i];
+         return &info->mesaDisplay;
      }
+      info = info->next;
   }

-   /* Create new XMesaDisplay */
+   /* Not found.
Create new XMesaDisplay */ + /* first allocate X-related resources and hook destroy callback */ - assert(NumDisplays < MAX_DISPLAYS); - xmdpy = &Displays[NumDisplays]; - NumDisplays++; - - if (!xmdpy->display && display) { - xmdpy->display = display; - xmdpy->screen = driver.create_pipe_screen(display); - xmdpy->smapi = CALLOC_STRUCT(st_manager); - if (xmdpy->smapi) { - xmdpy->smapi->screen = xmdpy->screen; - xmdpy->smapi->get_param = xmesa_get_param; - } + /* allocate mesa display info */ + info = (XMesaExtDisplayInfo *) Xmalloc(sizeof(XMesaExtDisplayInfo)); + if (info == NULL) { + pipe_mutex_unlock(init_mutex); + return NULL; + } + info->display = display; + xmdpy = &info->mesaDisplay; /* to be filled out below */ + + /* chain to the list of displays */ + _XLockMutex(_Xglobal_lock); + info->next = MesaExtInfo.head; + MesaExtInfo.head = info; + MesaExtInfo.ndisplays++; + _XUnlockMutex(_Xglobal_lock); + + /* now create the new XMesaDisplay info */ + assert(display); + + xmdpy->display = display; + xmdpy->screen = driver.create_pipe_screen(display); + xmdpy->smapi = CALLOC_STRUCT(st_manager); + xmdpy->pipe = NULL; + if (xmdpy->smapi) { + xmdpy->smapi->screen = xmdpy->screen; + xmdpy->smapi->get_param = xmesa_get_param; + } - if (xmdpy->screen && xmdpy->smapi) { - pipe_mutex_init(xmdpy->mutex); + if (xmdpy->screen && xmdpy->smapi) { + pipe_mutex_init(xmdpy->mutex); + } + else { + if (xmdpy->screen) { + xmdpy->screen->destroy(xmdpy->screen); + xmdpy->screen = NULL; } - else { - if (xmdpy->screen) { - xmdpy->screen->destroy(xmdpy->screen); - xmdpy->screen = NULL; - } - free(xmdpy->smapi); - xmdpy->smapi = NULL; + free(xmdpy->smapi); + xmdpy->smapi = NULL; - xmdpy->display = NULL; - } + xmdpy->display = NULL; } - if (!xmdpy->display || xmdpy->display != display) - xmdpy = NULL; pipe_mutex_unlock(init_mutex); return xmdpy; } + /**********************************************************************/ /***** X Utility Functions *****/ /**********************************************************************/ diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h index ffdffc0940f..ccf35a5eb5a 100644 --- a/src/gallium/state_trackers/glx/xlib/xm_api.h +++ b/src/gallium/state_trackers/glx/xlib/xm_api.h @@ -378,6 +378,9 @@ xmesa_check_buffer_size(XMesaBuffer b); extern void xmesa_destroy_buffers_on_display(Display *dpy); +extern void +xmesa_close_display(Display *dpy); + static inline GLuint xmesa_buffer_width(XMesaBuffer b) { diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c index 1ab339c459c..e2fb4d4a1e3 100644 --- a/src/gallium/state_trackers/nine/swapchain9.c +++ b/src/gallium/state_trackers/nine/swapchain9.c @@ -87,7 +87,9 @@ D3DWindowBuffer_create(struct NineSwapChain9 *This, memset(&whandle, 0, sizeof(whandle)); whandle.type = DRM_API_HANDLE_TYPE_FD; - This->screen->resource_get_handle(This->screen, resource, &whandle); + This->screen->resource_get_handle(This->screen, resource, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | + PIPE_HANDLE_USAGE_READ); stride = whandle.stride; dmaBufFd = whandle.handle; ID3DPresent_NewD3DWindowBufferFromDmaBuf(This->present, diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c index c2c24d693f2..2fd86612e9a 100644 --- a/src/gallium/state_trackers/va/buffer.c +++ b/src/gallium/state_trackers/va/buffer.c @@ -302,7 +302,8 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, memset(&whandle, 0, sizeof(whandle)); 
whandle.type = DRM_API_HANDLE_TYPE_FD; - if (!screen->resource_get_handle(screen, buf->derived_surface.resource, &whandle)) + if (!screen->resource_get_handle(screen, buf->derived_surface.resource, + &whandle, PIPE_HANDLE_USAGE_READ_WRITE)) return VA_STATUS_ERROR_INVALID_BUFFER; buf_info->handle = (intptr_t)whandle.handle; diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index 84a94949c47..861dac260a9 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -470,7 +470,8 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface, whandle.handle = memory_attibute->buffers[index]; whandle.stride = memory_attibute->pitches[index]; - resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); + resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!resource) return VA_STATUS_ERROR_ALLOCATION_FAILED; diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h index 614fa98fef7..d91de442fa7 100644 --- a/src/gallium/state_trackers/va/va_private.h +++ b/src/gallium/state_trackers/va/va_private.h @@ -148,11 +148,12 @@ PipeToProfile(enum pipe_video_profile profile) return VAProfileH264High; case PIPE_VIDEO_PROFILE_HEVC_MAIN: return VAProfileHEVCMain; + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return VAProfileHEVCMain10; case PIPE_VIDEO_PROFILE_MPEG4_AVC_EXTENDED: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH10: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH422: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: case PIPE_VIDEO_PROFILE_HEVC_MAIN_12: case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL: case PIPE_VIDEO_PROFILE_HEVC_MAIN_444: @@ -190,6 +191,8 @@ ProfileToPipe(VAProfile profile) return PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH; case VAProfileHEVCMain: return PIPE_VIDEO_PROFILE_HEVC_MAIN; + case VAProfileHEVCMain10: + return PIPE_VIDEO_PROFILE_HEVC_MAIN_10; case VAProfileNone: return PIPE_VIDEO_PROFILE_UNKNOWN; default: diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c index d57464b7d60..f09baed1d84 100644 --- a/src/gallium/state_trackers/xa/xa_tracker.c +++ b/src/gallium/state_trackers/xa/xa_tracker.c @@ -362,7 +362,8 @@ surface_create(struct xa_tracker *xa, template->bind |= PIPE_BIND_SCANOUT; if (whandle) - srf->tex = xa->screen->resource_from_handle(xa->screen, template, whandle); + srf->tex = xa->screen->resource_from_handle(xa->screen, template, whandle, + PIPE_HANDLE_USAGE_READ_WRITE); else srf->tex = xa->screen->resource_create(xa->screen, template); if (!srf->tex) @@ -548,7 +549,8 @@ xa_surface_handle(struct xa_surface *srf, memset(&whandle, 0, sizeof(whandle)); whandle.type = handle_type(type); - res = screen->resource_get_handle(screen, srf->tex, &whandle); + res = screen->resource_get_handle(screen, srf->tex, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!res) return -XA_ERR_INVAL; diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am index d99caae3cb0..3f1382e2848 100644 --- a/src/gallium/targets/libgl-xlib/Makefile.am +++ b/src/gallium/targets/libgl-xlib/Makefile.am @@ -81,6 +81,11 @@ AM_CPPFLAGS += -DGALLIUM_LLVMPIPE lib@GL_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) endif +if HAVE_GALLIUM_SWR +lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) +AM_CPPFLAGS += -DGALLIUM_SWR +endif + EXTRA_lib@GL_LIB@_la_DEPENDENCIES = libgl-xlib.sym 
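The HAVE_GALLIUM_SWR conditional above links libmesaswr into the xlib libGL and defines GALLIUM_SWR, and the osmesa hunk that follows wires the same driver into OSMesa. As a rough sketch of how a target typically consumes such a define at screen-creation time -- the entry-point names swr_create_screen(), llvmpipe_create_screen() and softpipe_create_screen() are assumptions for illustration, not code from this commit:

#include "pipe/p_screen.h"

struct sw_winsys;

/* Assumed rasterizer entry points, declared here only to keep the
 * sketch self-contained. */
struct pipe_screen *swr_create_screen(struct sw_winsys *ws);
struct pipe_screen *llvmpipe_create_screen(struct sw_winsys *ws);
struct pipe_screen *softpipe_create_screen(struct sw_winsys *ws);

static struct pipe_screen *
create_sw_screen(struct sw_winsys *ws)
{
#if defined(GALLIUM_SWR)
   return swr_create_screen(ws);      /* new AVX/AVX2 software rasterizer */
#elif defined(GALLIUM_LLVMPIPE)
   return llvmpipe_create_screen(ws); /* LLVM-JIT software rasterizer */
#else
   return softpipe_create_screen(ws); /* reference software rasterizer */
#endif
}

Since -DGALLIUM_SWR is only emitted when the conditional fires, builds configured without the swr driver compile exactly as before.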
EXTRA_DIST = SConscript libgl-xlib.sym diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am index 38e515f8252..5d394866946 100644 --- a/src/gallium/targets/osmesa/Makefile.am +++ b/src/gallium/targets/osmesa/Makefile.am @@ -74,6 +74,12 @@ lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) endif +if HAVE_GALLIUM_SWR +AM_CPPFLAGS += -DGALLIUM_SWR +lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) +lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) +endif + EXTRA_lib@OSMESA_LIB@_la_DEPENDENCIES = osmesa.sym EXTRA_DIST = \ osmesa.sym \ diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c index 288cf2ad629..5d5e0b0b8c3 100644 --- a/src/gallium/tests/trivial/compute.c +++ b/src/gallium/tests/trivial/compute.c @@ -421,7 +421,7 @@ static void destroy_globals(struct context *ctx) static void launch_grid(struct context *ctx, const uint *block_layout, const uint *grid_layout, uint32_t pc, - const void *input) + void *input) { struct pipe_context *pipe = ctx->pipe; struct pipe_grid_info info; diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h index 03fbf2bd0ee..ead603378cf 100644 --- a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h +++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h @@ -33,13 +33,13 @@ #ifndef __ADDR_INTERFACE_H__ #define __ADDR_INTERFACE_H__ +#include "addrtypes.h" + #if defined(__cplusplus) extern "C" { #endif -#include "addrtypes.h" - #define ADDRLIB_VERSION_MAJOR 5 #define ADDRLIB_VERSION_MINOR 25 #define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 59a801b1426..b670f263329 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -390,14 +390,8 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split) } } -static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, - enum radeon_bo_layout *microtiled, - enum radeon_bo_layout *macrotiled, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout) +static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; @@ -410,61 +404,54 @@ static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, tiling_flags = info.metadata.tiling_info; - *microtiled = RADEON_LAYOUT_LINEAR; - *macrotiled = RADEON_LAYOUT_LINEAR; + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ - *macrotiled = RADEON_LAYOUT_TILED; + md->macrotile = RADEON_LAYOUT_TILED; else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ - *microtiled = RADEON_LAYOUT_TILED; + md->microtile = RADEON_LAYOUT_TILED; - if (bankw && tile_split && mtilea && tile_split) { - *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); - *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); - *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); - *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); - } - if (scanout) - *scanout = AMDGPU_TILING_GET(tiling_flags, 
MICRO_TILE_MODE) == 0; /* DISPLAY */ + md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + + md->size_metadata = info.metadata.size_metadata; + memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); } -static void amdgpu_bo_set_tiling(struct pb_buffer *_buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtiled, - enum radeon_bo_layout macrotiled, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - uint32_t pitch, - bool scanout) +static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; uint32_t tiling_flags = 0; - if (macrotiled == RADEON_LAYOUT_TILED) + if (md->macrotile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ - else if (microtiled == RADEON_LAYOUT_TILED) + else if (md->microtile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ else tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ - tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config); - tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw)); - tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh)); - if (tile_split) - tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split)); - tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea)); - tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1); + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); + if (md->tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); - if (scanout) + if (md->scanout) tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ else tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ metadata.tiling_info = tiling_flags; + metadata.size_metadata = md->size_metadata; + memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata)); amdgpu_bo_set_metadata(bo->bo, &metadata); } @@ -720,8 +707,8 @@ static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) { - ws->base.buffer_set_tiling = amdgpu_bo_set_tiling; - ws->base.buffer_get_tiling = amdgpu_bo_get_tiling; + ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata; + ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata; ws->base.buffer_map = amdgpu_bo_map; ws->base.buffer_unmap = amdgpu_bo_unmap; ws->base.buffer_wait = amdgpu_bo_wait; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 6ad3cddf7cb..a2fb44a4b0e 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
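The amdgpu hunks above (and the matching radeon ones below) collapse the nine-argument buffer_get/set_tiling hooks into a single radeon_bo_metadata in/out struct, which is also what makes room for the new size_metadata/umd metadata fields. A caller-side sketch under an assumed header path and the vtable layout shown in the diff; demo_mark_scanout() is a made-up helper:

#include <string.h>
#include "radeon_winsys.h"   /* struct radeon_winsys, radeon_bo_metadata (assumed path) */

static void
demo_mark_scanout(struct radeon_winsys *ws, struct pb_buffer *buf)
{
   struct radeon_bo_metadata md;
   memset(&md, 0, sizeof(md));

   /* One call fills every field that previously needed its own out-pointer. */
   ws->buffer_get_metadata(buf, &md);

   if (md.macrotile == RADEON_LAYOUT_LINEAR)
      md.scanout = true;               /* touch only the field of interest */

   /* Write the complete tiling state back in one call. */
   ws->buffer_set_metadata(buf, &md);
}

Growing the struct later -- as the size_metadata/metadata fields already do here -- no longer means touching every implementation's signature.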
@@ -76,7 +76,7 @@ struct amdgpu_cs { uint8_t *flags; struct amdgpu_cs_buffer *buffers; - int buffer_indices_hashlist[512]; + int buffer_indices_hashlist[4096]; uint64_t used_vram; uint64_t used_gart; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index fc7562d8f57..938b9c244b2 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -93,13 +93,26 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) } /* Helper function to do the ioctls needed for setup and init. */ -static boolean do_winsys_init(struct amdgpu_winsys *ws) +static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd) { struct amdgpu_buffer_size_alignments alignment_info = {}; struct amdgpu_heap_info vram, gtt; struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {}; uint32_t vce_version = 0, vce_feature = 0; int r, i, j; + drmDevicePtr devinfo; + + /* Get PCI info. */ + r = drmGetDevice(fd, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice failed.\n"); + goto fail; + } + ws->info.pci_domain = devinfo->businfo.pci->domain; + ws->info.pci_bus = devinfo->businfo.pci->bus; + ws->info.pci_dev = devinfo->businfo.pci->dev; + ws->info.pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); /* Query hardware and driver information. */ r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo); @@ -437,7 +450,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) ws->info.drm_major = drm_major; ws->info.drm_minor = drm_minor; - if (!do_winsys_init(ws)) + if (!do_winsys_init(ws, fd)) goto fail; /* Create managers. */ diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 7e9ed0ca0fe..978df52447e 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -636,14 +636,8 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split) } } -static void radeon_bo_get_tiling(struct pb_buffer *_buf, - enum radeon_bo_layout *microtiled, - enum radeon_bo_layout *macrotiled, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout) +static void radeon_bo_get_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct radeon_bo *bo = radeon_bo(_buf); struct drm_radeon_gem_set_tiling args; @@ -657,81 +651,63 @@ static void radeon_bo_get_tiling(struct pb_buffer *_buf, &args, sizeof(args)); - *microtiled = RADEON_LAYOUT_LINEAR; - *macrotiled = RADEON_LAYOUT_LINEAR; + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; if (args.tiling_flags & RADEON_TILING_MICRO) - *microtiled = RADEON_LAYOUT_TILED; + md->microtile = RADEON_LAYOUT_TILED; else if (args.tiling_flags & RADEON_TILING_MICRO_SQUARE) - *microtiled = RADEON_LAYOUT_SQUARETILED; + md->microtile = RADEON_LAYOUT_SQUARETILED; if (args.tiling_flags & RADEON_TILING_MACRO) - *macrotiled = RADEON_LAYOUT_TILED; - if (bankw && tile_split && stencil_tile_split && mtilea && tile_split) { - *bankw = (args.tiling_flags >> RADEON_TILING_EG_BANKW_SHIFT) & RADEON_TILING_EG_BANKW_MASK; - *bankh = (args.tiling_flags >> RADEON_TILING_EG_BANKH_SHIFT) & RADEON_TILING_EG_BANKH_MASK; - *tile_split = (args.tiling_flags >> RADEON_TILING_EG_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_TILE_SPLIT_MASK; - *stencil_tile_split = (args.tiling_flags >> RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK; - *mtilea = 
(args.tiling_flags >> RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT) & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK; - *tile_split = eg_tile_split(*tile_split); - } - if (scanout) - *scanout = bo->rws->gen >= DRV_SI && !(args.tiling_flags & RADEON_TILING_R600_NO_SCANOUT); + md->macrotile = RADEON_LAYOUT_TILED; + + md->bankw = (args.tiling_flags >> RADEON_TILING_EG_BANKW_SHIFT) & RADEON_TILING_EG_BANKW_MASK; + md->bankh = (args.tiling_flags >> RADEON_TILING_EG_BANKH_SHIFT) & RADEON_TILING_EG_BANKH_MASK; + md->tile_split = (args.tiling_flags >> RADEON_TILING_EG_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_TILE_SPLIT_MASK; + md->stencil_tile_split = (args.tiling_flags >> RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK; + md->mtilea = (args.tiling_flags >> RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT) & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK; + md->tile_split = eg_tile_split(md->tile_split); + md->scanout = bo->rws->gen >= DRV_SI && !(args.tiling_flags & RADEON_TILING_R600_NO_SCANOUT); } -static void radeon_bo_set_tiling(struct pb_buffer *_buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtiled, - enum radeon_bo_layout macrotiled, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - uint32_t pitch, - bool scanout) +static void radeon_bo_set_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct radeon_bo *bo = radeon_bo(_buf); - struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct drm_radeon_gem_set_tiling args; memset(&args, 0, sizeof(args)); - /* Tiling determines how DRM treats the buffer data. - * We must flush CS when changing it if the buffer is referenced. */ - if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) { - cs->flush_cs(cs->flush_data, 0, NULL); - } - os_wait_until_zero(&bo->num_active_ioctls, PIPE_TIMEOUT_INFINITE); - if (microtiled == RADEON_LAYOUT_TILED) + if (md->microtile == RADEON_LAYOUT_TILED) args.tiling_flags |= RADEON_TILING_MICRO; - else if (microtiled == RADEON_LAYOUT_SQUARETILED) + else if (md->microtile == RADEON_LAYOUT_SQUARETILED) args.tiling_flags |= RADEON_TILING_MICRO_SQUARE; - if (macrotiled == RADEON_LAYOUT_TILED) + if (md->macrotile == RADEON_LAYOUT_TILED) args.tiling_flags |= RADEON_TILING_MACRO; - args.tiling_flags |= (bankw & RADEON_TILING_EG_BANKW_MASK) << + args.tiling_flags |= (md->bankw & RADEON_TILING_EG_BANKW_MASK) << RADEON_TILING_EG_BANKW_SHIFT; - args.tiling_flags |= (bankh & RADEON_TILING_EG_BANKH_MASK) << + args.tiling_flags |= (md->bankh & RADEON_TILING_EG_BANKH_MASK) << RADEON_TILING_EG_BANKH_SHIFT; - if (tile_split) { - args.tiling_flags |= (eg_tile_split_rev(tile_split) & + if (md->tile_split) { + args.tiling_flags |= (eg_tile_split_rev(md->tile_split) & RADEON_TILING_EG_TILE_SPLIT_MASK) << RADEON_TILING_EG_TILE_SPLIT_SHIFT; } - args.tiling_flags |= (stencil_tile_split & + args.tiling_flags |= (md->stencil_tile_split & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK) << RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT; - args.tiling_flags |= (mtilea & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK) << + args.tiling_flags |= (md->mtilea & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK) << RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT; - if (bo->rws->gen >= DRV_SI && !scanout) + if (bo->rws->gen >= DRV_SI && !md->scanout) args.tiling_flags |= RADEON_TILING_R600_NO_SCANOUT; args.handle = bo->handle; - args.pitch = pitch; + args.pitch = md->stride; drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_SET_TILING, @@ -1064,8 +1040,8 
@@ static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf) void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws) { - ws->base.buffer_set_tiling = radeon_bo_set_tiling; - ws->base.buffer_get_tiling = radeon_bo_get_tiling; + ws->base.buffer_set_metadata = radeon_bo_set_metadata; + ws->base.buffer_get_metadata = radeon_bo_get_metadata; ws->base.buffer_map = radeon_bo_map; ws->base.buffer_unmap = radeon_bo_unmap; ws->base.buffer_wait = radeon_bo_wait; diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c index dae121e4053..8d23bff5d74 100644 --- a/src/gallium/winsys/svga/drm/vmw_context.c +++ b/src/gallium/winsys/svga/drm/vmw_context.c @@ -315,6 +315,13 @@ vmw_swc_reserve(struct svga_winsys_context *swc, return vswc->command.buffer + vswc->command.used; } +static unsigned +vmw_swc_get_command_buffer_size(struct svga_winsys_context *swc) +{ + const struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc); + return vswc->command.used; +} + static void vmw_swc_context_relocation(struct svga_winsys_context *swc, uint32 *cid) @@ -761,6 +768,7 @@ vmw_svga_winsys_context_create(struct svga_winsys_screen *sws) vswc->base.destroy = vmw_swc_destroy; vswc->base.reserve = vmw_swc_reserve; + vswc->base.get_command_buffer_size = vmw_swc_get_command_buffer_size; vswc->base.surface_relocation = vmw_swc_surface_relocation; vswc->base.region_relocation = vmw_swc_region_relocation; vswc->base.mob_relocation = vmw_swc_mob_relocation; diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c index c86d95a14fe..7fc93e74812 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c @@ -52,6 +52,7 @@ #include <unistd.h> #define VMW_MAX_DEFAULT_TEXTURE_SIZE (128 * 1024 * 1024) +#define VMW_FENCE_TIMEOUT_SECONDS 60 struct vmw_region { @@ -721,7 +722,7 @@ vmw_ioctl_fence_finish(struct vmw_winsys_screen *vws, memset(&arg, 0, sizeof(arg)); arg.handle = handle; - arg.timeout_us = 10*1000000; + arg.timeout_us = VMW_FENCE_TIMEOUT_SECONDS*1000000; arg.lazy = 0; arg.flags = vflags; diff --git a/src/gallium/winsys/svga/drm/vmw_surface.c b/src/gallium/winsys/svga/drm/vmw_surface.c index 6c0ad3bbf19..a438b1a7c5b 100644 --- a/src/gallium/winsys/svga/drm/vmw_surface.c +++ b/src/gallium/winsys/svga/drm/vmw_surface.c @@ -170,6 +170,8 @@ vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc, *rebind = vsrf->rebind; vsrf->rebind = FALSE; vmw_svga_winsys_buffer_unmap(&vsrf->screen->base, vsrf->buf); + } else { + *rebind = FALSE; } pipe_mutex_unlock(vsrf->mutex); } diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c index 4d87a580cb1..e130cd256e9 100644 --- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c +++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c @@ -185,7 +185,8 @@ wsw_dt_from_handle(struct sw_winsys *ws, struct wrapper_sw_winsys *wsw = wrapper_sw_winsys(ws); struct pipe_resource *tex; - tex = wsw->screen->resource_from_handle(wsw->screen, templ, whandle); + tex = wsw->screen->resource_from_handle(wsw->screen, templ, whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!tex) return NULL; @@ -201,7 +202,8 @@ wsw_dt_get_handle(struct sw_winsys *ws, struct wrapper_sw_displaytarget *wdt = wrapper_sw_displaytarget(dt); struct pipe_resource *tex = wdt->tex; - return wsw->screen->resource_get_handle(wsw->screen, tex, whandle); + return wsw->screen->resource_get_handle(wsw->screen, tex, 
whandle, + PIPE_HANDLE_USAGE_READ_WRITE); } static void * diff --git a/src/gbm/main/gbm.h b/src/gbm/main/gbm.h index 8db2153e84b..63d9a9edfd6 100644 --- a/src/gbm/main/gbm.h +++ b/src/gbm/main/gbm.h @@ -28,16 +28,16 @@ #ifndef _GBM_H_ #define _GBM_H_ -#ifdef __cplusplus -extern "C" { -#endif - - #define __GBM__ 1 #include <stddef.h> #include <stdint.h> +#ifdef __cplusplus +extern "C" { +#endif + + /** * \file gbm.h * \brief Generic Buffer Manager diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index 62bfe845c08..896f2252f36 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -858,7 +858,8 @@ dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format, width, height, format, __DRI_IMAGE_USE_SHARE | - __DRI_IMAGE_USE_SCANOUT, + __DRI_IMAGE_USE_SCANOUT | + __DRI_IMAGE_USE_BACKBUFFER, buffer); pixmap_buffer = buffer->image; @@ -878,7 +879,8 @@ dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format, (draw->ext->image->createImage)(draw->dri_screen, width, height, format, __DRI_IMAGE_USE_SHARE | - __DRI_IMAGE_USE_LINEAR, + __DRI_IMAGE_USE_LINEAR | + __DRI_IMAGE_USE_BACKBUFFER, buffer); pixmap_buffer = buffer->linear_buffer; diff --git a/src/mapi/glapi/gen/ARB_direct_state_access.xml b/src/mapi/glapi/gen/ARB_direct_state_access.xml index 293d7164680..155b6f8d528 100644 --- a/src/mapi/glapi/gen/ARB_direct_state_access.xml +++ b/src/mapi/glapi/gen/ARB_direct_state_access.xml @@ -153,32 +153,32 @@ <!-- Framebuffer object functions --> - <function name="CreateFramebuffers" offset="assign"> + <function name="CreateFramebuffers"> <param name="n" type="GLsizei" /> <param name="framebuffers" type="GLuint *" /> </function> - <function name="NamedFramebufferRenderbuffer" offset="assign"> + <function name="NamedFramebufferRenderbuffer"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="renderbuffertarget" type="GLenum" /> <param name="renderbuffer" type="GLuint" /> </function> - <function name="NamedFramebufferParameteri" offset="assign"> + <function name="NamedFramebufferParameteri"> <param name="framebuffer" type="GLuint" /> <param name="pname" type="GLenum" /> <param name="param" type="GLint" /> </function> - <function name="NamedFramebufferTexture" offset="assign"> + <function name="NamedFramebufferTexture"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="texture" type="GLuint" /> <param name="level" type="GLint" /> </function> - <function name="NamedFramebufferTextureLayer" offset="assign"> + <function name="NamedFramebufferTextureLayer"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="texture" type="GLuint" /> @@ -186,29 +186,29 @@ <param name="layer" type="GLint" /> </function> - <function name="NamedFramebufferDrawBuffer" offset="assign"> + <function name="NamedFramebufferDrawBuffer"> <param name="framebuffer" type="GLuint" /> <param name="buf" type="GLenum" /> </function> - <function name="NamedFramebufferDrawBuffers" offset="assign"> + <function name="NamedFramebufferDrawBuffers"> <param name="framebuffer" type="GLuint" /> <param name="n" type="GLsizei" /> <param name="bufs" type="const GLenum *" /> </function> - <function name="NamedFramebufferReadBuffer" offset="assign"> + <function name="NamedFramebufferReadBuffer"> <param name="framebuffer" type="GLuint" /> <param name="buf" type="GLenum" /> </function> - <function 
name="InvalidateNamedFramebufferData" offset="assign"> + <function name="InvalidateNamedFramebufferData"> <param name="framebuffer" type="GLuint" /> <param name="numAttachments" type="GLsizei" /> <param name="attachments" type="const GLenum *" /> </function> - <function name="InvalidateNamedFramebufferSubData" offset="assign"> + <function name="InvalidateNamedFramebufferSubData"> <param name="framebuffer" type="GLuint" /> <param name="numAttachments" type="GLsizei" /> <param name="attachments" type="const GLenum *" /> @@ -218,35 +218,35 @@ <param name="height" type="GLsizei" /> </function> - <function name="ClearNamedFramebufferiv" offset="assign"> + <function name="ClearNamedFramebufferiv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLint *" /> </function> - <function name="ClearNamedFramebufferuiv" offset="assign"> + <function name="ClearNamedFramebufferuiv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLuint *" /> </function> - <function name="ClearNamedFramebufferfv" offset="assign"> + <function name="ClearNamedFramebufferfv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLfloat *" /> </function> - <function name="ClearNamedFramebufferfi" offset="assign"> + <function name="ClearNamedFramebufferfi"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="depth" type="GLfloat" /> <param name="stencil" type="GLint" /> </function> - <function name="BlitNamedFramebuffer" offset="assign"> + <function name="BlitNamedFramebuffer"> <param name="readFramebuffer" type="GLuint" /> <param name="drawFramebuffer" type="GLuint" /> <param name="srcX0" type="GLint" /> @@ -261,19 +261,19 @@ <param name="filter" type="GLenum" /> </function> - <function name="CheckNamedFramebufferStatus" offset="assign"> + <function name="CheckNamedFramebufferStatus"> <return type="GLenum" /> <param name="framebuffer" type="GLuint" /> <param name="target" type="GLenum" /> </function> - <function name="GetNamedFramebufferParameteriv" offset="assign"> + <function name="GetNamedFramebufferParameteriv"> <param name="framebuffer" type="GLuint" /> <param name="pname" type="GLenum" /> <param name="param" type="GLint *" /> </function> - <function name="GetNamedFramebufferAttachmentParameteriv" offset="assign"> + <function name="GetNamedFramebufferAttachmentParameteriv"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="pname" type="GLenum" /> diff --git a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml index 14e1c20b9d5..47e26abb28e 100644 --- a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml +++ b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml @@ -7,7 +7,7 @@ <category name="GL_ARB_get_texture_sub_image" number="165"> - <function name="GetTextureSubImage" offset="assign"> + <function name="GetTextureSubImage"> <param name="texture" type="GLuint"/> <param name="level" type="GLint"/> <param name="xoffset" type="GLint"/> @@ -22,7 +22,7 @@ <param name="pixels" type="GLvoid *"/> </function> - <function name="GetCompressedTextureSubImage" offset="assign"> + <function name="GetCompressedTextureSubImage"> <param name="texture" type="GLuint"/> <param name="level" 
type="GLint"/> <param name="xoffset" type="GLint"/> diff --git a/src/mapi/glapi/gen/ARB_internalformat_query2.xml b/src/mapi/glapi/gen/ARB_internalformat_query2.xml new file mode 100644 index 00000000000..9b0f320fba7 --- /dev/null +++ b/src/mapi/glapi/gen/ARB_internalformat_query2.xml @@ -0,0 +1,119 @@ +<?xml version="1.0"?> +<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd"> + +<OpenGLAPI> + +<category name="GL_ARB_internalformat_query2" number="131"> + <!-- Other existing enums are reused for this extension. --> + + <enum name="INTERNALFORMAT_SUPPORTED" value="0x826F"/> + <enum name="INTERNALFORMAT_PREFERRED" value="0x8270"/> + <enum name="INTERNALFORMAT_RED_SIZE" value="0x8271"/> + <enum name="INTERNALFORMAT_GREEN_SIZE" value="0x8272"/> + <enum name="INTERNALFORMAT_BLUE_SIZE" value="0x8273"/> + <enum name="INTERNALFORMAT_ALPHA_SIZE" value="0x8274"/> + <enum name="INTERNALFORMAT_DEPTH_SIZE" value="0x8275"/> + <enum name="INTERNALFORMAT_STENCIL_SIZE" value="0x8276"/> + <enum name="INTERNALFORMAT_SHARED_SIZE" value="0x8277"/> + <enum name="INTERNALFORMAT_RED_TYPE" value="0x8278"/> + <enum name="INTERNALFORMAT_GREEN_TYPE" value="0x8279"/> + <enum name="INTERNALFORMAT_BLUE_TYPE" value="0x827A"/> + <enum name="INTERNALFORMAT_ALPHA_TYPE" value="0x827B"/> + <enum name="INTERNALFORMAT_DEPTH_TYPE" value="0x827C"/> + <enum name="INTERNALFORMAT_STENCIL_TYPE" value="0x827D"/> + <enum name="MAX_WIDTH" value="0x827E"/> + <enum name="MAX_HEIGHT" value="0x827F"/> + <enum name="MAX_DEPTH" value="0x8280"/> + <enum name="MAX_LAYERS" value="0x8281"/> + <enum name="MAX_COMBINED_DIMENSIONS" value="0x8282"/> + <enum name="COLOR_COMPONENTS" value="0x8283"/> + <enum name="DEPTH_COMPONENTS" value="0x8284"/> + <enum name="STENCIL_COMPONENTS" value="0x8285"/> + <enum name="COLOR_RENDERABLE" value="0x8286"/> + <enum name="DEPTH_RENDERABLE" value="0x8287"/> + <enum name="STENCIL_RENDERABLE" value="0x8288"/> + <enum name="FRAMEBUFFER_RENDERABLE" value="0x8289"/> + <enum name="FRAMEBUFFER_RENDERABLE_LAYERED" value="0x828A"/> + <enum name="FRAMEBUFFER_BLEND" value="0x828B"/> + <enum name="READ_PIXELS" value="0x828C"/> + <enum name="READ_PIXELS_FORMAT" value="0x828D"/> + <enum name="READ_PIXELS_TYPE" value="0x828E"/> + <enum name="TEXTURE_IMAGE_FORMAT" value="0x828F"/> + <enum name="TEXTURE_IMAGE_TYPE" value="0x8290"/> + <enum name="GET_TEXTURE_IMAGE_FORMAT" value="0x8291"/> + <enum name="GET_TEXTURE_IMAGE_TYPE" value="0x8292"/> + <enum name="MIPMAP" value="0x8293"/> + <enum name="MANUAL_GENERATE_MIPMAP" value="0x8294"/> + <enum name="AUTO_GENERATE_MIPMAP" value="0x8295"/> + <enum name="COLOR_ENCODING" value="0x8296"/> + <enum name="SRGB_READ" value="0x8297"/> + <enum name="SRGB_WRITE" value="0x8298"/> + <enum name="SRGB_DECODE_ARB" value="0x8299"/> + <enum name="FILTER" value="0x829A"/> + <enum name="VERTEX_TEXTURE" value="0x829B"/> + <enum name="TESS_CONTROL_TEXTURE" value="0x829C"/> + <enum name="TESS_EVALUATION_TEXTURE" value="0x829D"/> + <enum name="GEOMETRY_TEXTURE" value="0x829E"/> + <enum name="FRAGMENT_TEXTURE" value="0x829F"/> + <enum name="COMPUTE_TEXTURE" value="0x82A0"/> + <enum name="TEXTURE_SHADOW" value="0x82A1"/> + <enum name="TEXTURE_GATHER" value="0x82A2"/> + <enum name="TEXTURE_GATHER_SHADOW" value="0x82A3"/> + <enum name="SHADER_IMAGE_LOAD" value="0x82A4"/> + <enum name="SHADER_IMAGE_STORE" value="0x82A5"/> + <enum name="SHADER_IMAGE_ATOMIC" value="0x82A6"/> + <enum name="IMAGE_TEXEL_SIZE" value="0x82A7"/> + <enum name="IMAGE_COMPATIBILITY_CLASS" value="0x82A8"/> + <enum name="IMAGE_PIXEL_FORMAT" 
value="0x82A9"/> + <enum name="IMAGE_PIXEL_TYPE" value="0x82AA"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST" value="0x82AC"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST" value="0x82AD"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE" value="0x82AE"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE" value="0x82AF"/> + <enum name="TEXTURE_COMPRESSED" value="0x86A1"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_WIDTH" value="0x82B1"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_HEIGHT" value="0x82B2"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_SIZE" value="0x82B3"/> + <enum name="CLEAR_BUFFER" value="0x82B4"/> + <enum name="TEXTURE_VIEW" value="0x82B5"/> + <enum name="VIEW_COMPATIBILITY_CLASS" value="0x82B6"/> + <enum name="FULL_SUPPORT" value="0x82B7"/> + <enum name="CAVEAT_SUPPORT" value="0x82B8"/> + <enum name="IMAGE_CLASS_4_X_32" value="0x82B9"/> + <enum name="IMAGE_CLASS_2_X_32" value="0x82BA"/> + <enum name="IMAGE_CLASS_1_X_32" value="0x82BB"/> + <enum name="IMAGE_CLASS_4_X_16" value="0x82BC"/> + <enum name="IMAGE_CLASS_2_X_16" value="0x82BD"/> + <enum name="IMAGE_CLASS_1_X_16" value="0x82BE"/> + <enum name="IMAGE_CLASS_4_X_8" value="0x82BF"/> + <enum name="IMAGE_CLASS_2_X_8" value="0x82C0"/> + <enum name="IMAGE_CLASS_1_X_8" value="0x82C1"/> + <enum name="IMAGE_CLASS_11_11_10" value="0x82C2"/> + <enum name="IMAGE_CLASS_10_10_10_2" value="0x82C3"/> + <enum name="VIEW_CLASS_128_BITS" value="0x82C4"/> + <enum name="VIEW_CLASS_96_BITS" value="0x82C5"/> + <enum name="VIEW_CLASS_64_BITS" value="0x82C6"/> + <enum name="VIEW_CLASS_48_BITS" value="0x82C7"/> + <enum name="VIEW_CLASS_32_BITS" value="0x82C8"/> + <enum name="VIEW_CLASS_24_BITS" value="0x82C9"/> + <enum name="VIEW_CLASS_16_BITS" value="0x82CA"/> + <enum name="VIEW_CLASS_8_BITS" value="0x82CB"/> + <enum name="VIEW_CLASS_S3TC_DXT1_RGB" value="0x82CC"/> + <enum name="VIEW_CLASS_S3TC_DXT1_RGBA" value="0x82CD"/> + <enum name="VIEW_CLASS_S3TC_DXT3_RGBA" value="0x82CE"/> + <enum name="VIEW_CLASS_S3TC_DXT5_RGBA" value="0x82CF"/> + <enum name="VIEW_CLASS_RGTC1_RED" value="0x82D0"/> + <enum name="VIEW_CLASS_RGTC2_RG" value="0x82D1"/> + <enum name="VIEW_CLASS_BPTC_UNORM" value="0x82D2"/> + <enum name="VIEW_CLASS_BPTC_FLOAT" value="0x82D3"/> + + <function name="GetInternalformati64v" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="internalformat" type="GLenum"/> + <param name="pname" type="GLenum"/> + <param name="bufSize" type="GLsizei"/> + <param name="params" type="GLint64 *"/> + </function> +</category> + +</OpenGLAPI> diff --git a/src/mapi/glapi/gen/ARB_shader_subroutine.xml b/src/mapi/glapi/gen/ARB_shader_subroutine.xml index 04b75cb8f59..8a7d08c7f71 100644 --- a/src/mapi/glapi/gen/ARB_shader_subroutine.xml +++ b/src/mapi/glapi/gen/ARB_shader_subroutine.xml @@ -7,21 +7,21 @@ <category name="GL_ARB_shader_subroutine" number="90"> - <function name="GetSubroutineUniformLocation" offset="assign"> + <function name="GetSubroutineUniformLocation"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="name" type="const GLchar *"/> <return type="GLint"/> </function> - <function name="GetSubroutineIndex" offset="assign"> + <function name="GetSubroutineIndex"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="name" type="const GLchar *"/> <return type="GLuint"/> </function> - <function name="GetActiveSubroutineUniformiv" offset="assign"> + <function name="GetActiveSubroutineUniformiv"> <param name="program" type="GLuint"/> <param 
name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -29,7 +29,7 @@ <param name="values" type="GLint *" output="true"/> </function> - <function name="GetActiveSubroutineUniformName" offset="assign"> + <function name="GetActiveSubroutineUniformName"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -38,7 +38,7 @@ <param name="name" type="GLchar *" output="true"/> </function> - <function name="GetActiveSubroutineName" offset="assign"> + <function name="GetActiveSubroutineName"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -47,19 +47,19 @@ <param name="name" type="GLchar *" output="true"/> </function> - <function name="UniformSubroutinesuiv" offset="assign"> + <function name="UniformSubroutinesuiv"> <param name="shadertype" type="GLenum"/> <param name="count" type="GLsizei"/> <param name="indices" type="const GLuint *"/> </function> - <function name="GetUniformSubroutineuiv" offset="assign"> + <function name="GetUniformSubroutineuiv"> <param name="shadertype" type="GLenum"/> <param name="location" type="GLint"/> <param name="params" type="GLuint *" output="true"/> </function> - <function name="GetProgramStageiv" offset="assign"> + <function name="GetProgramStageiv"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="pname" type="GLenum"/> diff --git a/src/mapi/glapi/gen/ARB_tessellation_shader.xml b/src/mapi/glapi/gen/ARB_tessellation_shader.xml index 16a213933ef..77f2228ae3b 100644 --- a/src/mapi/glapi/gen/ARB_tessellation_shader.xml +++ b/src/mapi/glapi/gen/ARB_tessellation_shader.xml @@ -49,11 +49,11 @@ <enum value="0x8E89" name="MAX_TESS_CONTROL_UNIFORM_BLOCKS"/> <enum value="0x8E8A" name="MAX_TESS_EVALUATION_UNIFORM_BLOCKS"/> - <function name="PatchParameteri" offset="assign"> + <function name="PatchParameteri"> <param name="pname" type="GLenum"/> <param name="value" type="GLint"/> </function> - <function name="PatchParameterfv" offset="assign"> + <function name="PatchParameterfv"> <param name="pname" type="GLenum"/> <param name="values" type="const GLfloat *"/> </function> diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am index cd7feabba24..8421af48854 100644 --- a/src/mapi/glapi/gen/Makefile.am +++ b/src/mapi/glapi/gen/Makefile.am @@ -148,6 +148,7 @@ API_XML = \ ARB_indirect_parameters.xml \ ARB_instanced_arrays.xml \ ARB_internalformat_query.xml \ + ARB_internalformat_query2.xml \ ARB_invalidate_subdata.xml \ ARB_map_buffer_range.xml \ ARB_multi_bind.xml \ diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index db98ac05fd9..8b49f915169 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -8195,7 +8195,7 @@ <xi:include href="ARB_framebuffer_no_attachments.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/> -<!-- ARB extensions #131 --> +<xi:include href="ARB_internalformat_query2.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/> <category name="GL_ARB_explicit_uniform_location" number="128"> <enum name="MAX_UNIFORM_LOCATIONS" count="1" value="0x826E" > diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c index 752aaf6c006..e96f92af5bb 100644 --- a/src/mesa/drivers/common/driverfuncs.c +++ b/src/mesa/drivers/common/driverfuncs.c @@ -90,7 +90,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver) /* Texture functions */ driver->ChooseTextureFormat = 
_mesa_choose_tex_format; - driver->QuerySamplesForFormat = _mesa_query_samples_for_format; + driver->QueryInternalFormat = _mesa_query_internal_format_default; driver->TexImage = _mesa_store_teximage; driver->TexSubImage = _mesa_store_texsubimage; driver->GetTexSubImage = _mesa_meta_GetTexSubImage; diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index 60ae5f7577f..c2efa50a33d 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -536,7 +536,7 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, int xoffset, int yoffset, int zoffset, int width, int height, int depth, GLenum format, GLenum type, const void *pixels, - bool allocate_storage, bool create_pbo, + bool create_pbo, const struct gl_pixelstore_attrib *packing); extern bool diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c index 639d3236359..dfd3327dd55 100644 --- a/src/mesa/drivers/common/meta_tex_subimage.c +++ b/src/mesa/drivers/common/meta_tex_subimage.c @@ -175,7 +175,7 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, int xoffset, int yoffset, int zoffset, int width, int height, int depth, GLenum format, GLenum type, const void *pixels, - bool allocate_storage, bool create_pbo, + bool create_pbo, const struct gl_pixelstore_attrib *packing) { struct gl_buffer_object *pbo = NULL; @@ -214,19 +214,18 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, */ image_height = packing->ImageHeight == 0 ? height : packing->ImageHeight; + _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | + MESA_META_PIXEL_STORE)); + pbo_tex_image = create_texture_for_pbo(ctx, create_pbo, GL_PIXEL_UNPACK_BUFFER, dims, width, height, depth, format, type, pixels, packing, &pbo, &pbo_tex); - if (!pbo_tex_image) + if (!pbo_tex_image) { + _mesa_meta_end(ctx); return false; - - if (allocate_storage) - ctx->Driver.AllocTextureImageBuffer(ctx, tex_image); - - _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | - MESA_META_PIXEL_STORE)); + } readFb = ctx->Driver.NewFramebuffer(ctx, 0xDEADBEEF); if (readFb == NULL) @@ -361,15 +360,18 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, */ image_height = packing->ImageHeight == 0 ? 
height : packing->ImageHeight; + _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | + MESA_META_PIXEL_STORE)); + pbo_tex_image = create_texture_for_pbo(ctx, false, GL_PIXEL_PACK_BUFFER, dims, width, height, depth, format, type, pixels, packing, &pbo, &pbo_tex); - if (!pbo_tex_image) - return false; - _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | - MESA_META_PIXEL_STORE)); + if (!pbo_tex_image) { + _mesa_meta_end(ctx); + return false; + } /* GL_CLAMP_FRAGMENT_COLOR doesn't affect ReadPixels and GettexImage */ if (ctx->Extensions.ARB_color_buffer_float) diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h index aecd7c23f45..39b328a3f3e 100644 --- a/src/mesa/drivers/dri/i915/intel_context.h +++ b/src/mesa/drivers/dri/i915/intel_context.h @@ -40,17 +40,16 @@ extern "C" { #define virtual virt #endif -#include "drm.h" -#include "intel_bufmgr.h" - -#include "intel_screen.h" -#include "intel_tex_obj.h" -#include "i915_drm.h" - +#include <drm.h> +#include <intel_bufmgr.h> +#include <i915_drm.h> #ifdef __cplusplus #undef virtual #endif +#include "intel_screen.h" +#include "intel_tex_obj.h" + #include "tnl/t_vertex.h" #define TAG(x) intel##x diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 8f92fd7cfd2..2802ec9887c 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -124,6 +124,7 @@ i965_FILES = \ brw_ff_gs.h \ brw_fs_channel_expressions.cpp \ brw_fs_vector_splitting.cpp \ + brw_formatquery.c \ brw_gs.c \ brw_gs.h \ brw_gs_state.c \ diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c index 3c18858abf1..d333d10d299 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c +++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c @@ -85,8 +85,10 @@ static void compute_tri_direction( struct brw_clip_compile *c ) /* Take their crossproduct: */ brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); - brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW), + brw_swizzle(f, BRW_SWIZZLE_ZXYW)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)), + brw_swizzle(f, BRW_SWIZZLE_YZXW)); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c index 7ef3305a25f..3e6664e4a82 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_util.c +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c @@ -98,7 +98,8 @@ void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) /* value.xyz *= value.rhw */ brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); + brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, + brw_swizzle(pos, BRW_SWIZZLE_WWWW)); brw_set_default_access_mode(p, BRW_ALIGN_1); } @@ -194,11 +195,11 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_set_default_access_mode(p, BRW_ALIGN_16); brw_MOV(p, brw_writemask(t_nopersp, WRITEMASK_ZW), - brw_swizzle(tmp, 0, 1, 0, 1)); + brw_swizzle(tmp, BRW_SWIZZLE_XYXY)); /* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */ brw_ADD(p, t_nopersp, t_nopersp, - negate(brw_swizzle(v0_ndc_copy, 0, 1, 0, 1))); + negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY))); /* Add the 
absolute values of the X and Y deltas so that if * the points aren't in the same place on the screen we get @@ -212,8 +213,8 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, */ brw_ADD(p, brw_writemask(t_nopersp, WRITEMASK_XY), - brw_abs(brw_swizzle(t_nopersp, 0, 2, 0, 0)), - brw_abs(brw_swizzle(t_nopersp, 1, 3, 0, 0))); + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW))); brw_set_default_access_mode(p, BRW_ALIGN_1); /* If the points are in the same place, just substitute a @@ -234,7 +235,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp), vec1(suboffset(t_nopersp, 1))); brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, 0, 0, 0, 0)); + brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX)); brw_set_default_access_mode(p, BRW_ALIGN_1); release_tmp(c, tmp); diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 31b6b2a3641..2d480d02366 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -75,53 +75,29 @@ * Mesa's Driver Functions ***************************************/ -static size_t -brw_query_samples_for_format(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]) -{ - struct brw_context *brw = brw_context(ctx); - - (void) target; - - switch (brw->gen) { - case 9: - samples[0] = 16; - samples[1] = 8; - samples[2] = 4; - samples[3] = 2; - return 4; - - case 8: - samples[0] = 8; - samples[1] = 4; - samples[2] = 2; - return 3; - - case 7: - samples[0] = 8; - samples[1] = 4; - return 2; - - case 6: - samples[0] = 4; - return 1; +const char *const brw_vendor_string = "Intel Open Source Technology Center"; +static const char * +get_bsw_model(const struct intel_screen *intelScreen) +{ + switch (intelScreen->eu_total) { + case 16: + return "405"; + case 12: + return "400"; default: - assert(brw->gen < 6); - samples[0] = 1; - return 1; + return " "; } } -const char *const brw_vendor_string = "Intel Open Source Technology Center"; - const char * -brw_get_renderer_string(unsigned deviceID) +brw_get_renderer_string(const struct intel_screen *intelScreen) { const char *chipset; static char buffer[128]; + char *bsw = NULL; - switch (deviceID) { + switch (intelScreen->deviceID) { #undef CHIPSET #define CHIPSET(id, symbol, str) case id: chipset = str; break; #include "pci_ids/i965_pci_ids.h" @@ -130,7 +106,18 @@ brw_get_renderer_string(unsigned deviceID) break; } + /* Braswell branding is funny, so we have to fix it up here */ + if (intelScreen->deviceID == 0x22B1) { + bsw = strdup(chipset); + char *needle = strstr(bsw, "XXX"); + if (needle) { + memcpy(needle, get_bsw_model(intelScreen), 3); + chipset = bsw; + } + } + (void) driGetRendererString(buffer, chipset, 0); + free(bsw); return buffer; } @@ -145,7 +132,7 @@ intel_get_string(struct gl_context * ctx, GLenum name) case GL_RENDERER: return - (GLubyte *) brw_get_renderer_string(brw->intelScreen->deviceID); + (GLubyte *) brw_get_renderer_string(brw->intelScreen); default: return NULL; @@ -379,7 +366,7 @@ brw_init_driver_functions(struct brw_context *brw, if (brw->gen >= 7) brw_init_conditional_render_functions(functions); - functions->QuerySamplesForFormat = brw_query_samples_for_format; + functions->QueryInternalFormat = brw_query_internal_format; functions->NewTransformFeedback = brw_new_transform_feedback; functions->DeleteTransformFeedback = 
brw_delete_transform_feedback; @@ -682,6 +669,11 @@ brw_initialize_context_constants(struct brw_context *brw) brw->intelScreen->compiler->glsl_compiler_options[i]; } + if (brw->gen >= 7) { + ctx->Const.MaxViewportWidth = 32768; + ctx->Const.MaxViewportHeight = 32768; + } + /* ARB_viewport_array */ if (brw->gen >= 6 && ctx->API == API_OPENGL_CORE) { ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS; @@ -698,8 +690,8 @@ brw_initialize_context_constants(struct brw_context *brw) ctx->Const.MaxVertexStreams = MIN2(4, MAX_VERTEX_STREAMS); /* ARB_framebuffer_no_attachments */ - ctx->Const.MaxFramebufferWidth = ctx->Const.MaxViewportWidth; - ctx->Const.MaxFramebufferHeight = ctx->Const.MaxViewportHeight; + ctx->Const.MaxFramebufferWidth = 16384; + ctx->Const.MaxFramebufferHeight = 16384; ctx->Const.MaxFramebufferLayers = ctx->Const.MaxArrayTextureLayers; ctx->Const.MaxFramebufferSamples = max_samples; } @@ -962,7 +954,18 @@ brwCreateContext(gl_api api, brw->max_ds_threads = devinfo->max_ds_threads; brw->max_gs_threads = devinfo->max_gs_threads; brw->max_wm_threads = devinfo->max_wm_threads; - brw->max_cs_threads = devinfo->max_cs_threads; + /* FINISHME: Do this for all platforms that the kernel supports */ + if (brw->is_cherryview && + screen->subslice_total > 0 && screen->eu_total > 0) { + /* Logical CS threads = EUs per subslice * 7 threads per EU */ + brw->max_cs_threads = screen->eu_total / screen->subslice_total * 7; + + /* Fuse configurations may give more threads than expected, never less. */ + if (brw->max_cs_threads < devinfo->max_cs_threads) + brw->max_cs_threads = devinfo->max_cs_threads; + } else { + brw->max_cs_threads = devinfo->max_cs_threads; + } brw->urb.size = devinfo->urb.size; brw->urb.min_vs_entries = devinfo->urb.min_vs_entries; brw->urb.max_vs_entries = devinfo->urb.max_vs_entries; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 6b82bea52c0..a953745b114 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1341,7 +1341,8 @@ extern void intelInitClearFuncs(struct dd_function_table *functions); */ extern const char *const brw_vendor_string; -extern const char *brw_get_renderer_string(unsigned deviceID); +extern const char * +brw_get_renderer_string(const struct intel_screen *intelScreen); enum { DRI_CONF_BO_REUSE_DISABLED, @@ -1875,6 +1876,11 @@ void brw_emit_depth_stall_flushes(struct brw_context *brw); void gen7_emit_vs_workaround_flush(struct brw_context *brw); void gen7_emit_cs_stall_flush(struct brw_context *brw); +/* brw_queryformat.c */ +void brw_query_internal_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params); + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index 38a27da898c..3666190fc36 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -312,7 +312,7 @@ static const struct brw_device_info brw_device_info_chv = { .max_ds_threads = 80, .max_gs_threads = 80, .max_wm_threads = 128, - .max_cs_threads = 28, + .max_cs_threads = 6 * 7, .urb = { .size = 192, .min_vs_entries = 34, diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h index 48e0dee9084..4e7f3135960 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.h +++ b/src/mesa/drivers/dri/i965/brw_device_info.h @@ -25,6 +25,9 @@ #pragma once #include <stdbool.h> +/** + * Intel 
hardware information and quirks + */ struct brw_device_info { int gen; /**< Generation number: 4, 5, 6, 7, ... */ @@ -49,7 +52,7 @@ struct brw_device_info bool has_resource_streamer; /** - * Quirks: + * \name Intel hardware quirks * @{ */ bool has_negative_rhw_bug; @@ -65,26 +68,69 @@ struct brw_device_info /** @} */ /** - * GPU Limits: + * \name GPU hardware limits + * + * In general, you can find shader thread maximums by looking at the "Maximum + * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS, + * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry + * limits come from the "Number of URB Entries" field in the + * 3DSTATE_URB_VS command and friends. + * + * These fields are used to calculate the scratch space to allocate. The + * amount of scratch space can be larger without being harmful on modern + * GPUs; however, prior to Haswell, programming the maximum number of threads + * to greater than the hardware maximum would cause GPU performance to tank. + * * @{ */ /** * Total number of slices present on the device whether or not they've been * fused off. + * + * XXX: CS thread counts are limited by the inability to do cross-subslice + * communication. It is effectively the number of logical threads which + * can be executed in a subslice. Fuse configurations may cause this number + * to change, so we program @max_cs_threads as the lower maximum. */ unsigned num_slices; - unsigned max_vs_threads; - unsigned max_hs_threads; - unsigned max_ds_threads; - unsigned max_gs_threads; + unsigned max_vs_threads; /**< Maximum Vertex Shader threads */ + unsigned max_hs_threads; /**< Maximum Hull Shader threads */ + unsigned max_ds_threads; /**< Maximum Domain Shader threads */ + unsigned max_gs_threads; /**< Maximum Geometry Shader threads. */ + /** + * Theoretical maximum number of Pixel Shader threads. + * + * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will + * automatically scale pixel shader thread count, based on a single value + * programmed into 3DSTATE_PS. + * + * To calculate the maximum number of threads for Gen8 and beyond (which + * have multiple Pixel Shader Dispatchers): + * + * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD" + * - Usually there's only one PSD per subslice, so use the number of + * subslices for the number of PSDs. + * - For max_wm_threads, the total should be PSD threads * #PSDs. + */ unsigned max_wm_threads; + + /** + * Maximum Compute Shader threads. + * + * Thread count * number of EUs per subslice + */ unsigned max_cs_threads; struct { /** - * Hardware default URB size. The units this is expressed in are - * somewhat inconsistent: 512b units on Gen4-5, KB on Gen6-7, and KB - * times the slice count on Gen8+. + * Hardware default URB size. + * + * The units this is expressed in are somewhat inconsistent: 512b units + * on Gen4-5, KB on Gen6-7, and KB times the slice count on Gen8+. + * + * Look up "URB Size" in the "Device Attributes" page, and take the + * maximum. Look up the slice count for each GT SKU on the same page. + * urb.size = URB Size (kbytes) / slice count */ unsigned size; unsigned min_vs_entries; diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c index 40ec87d38f0..6961a88c6a8 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.c +++ b/src/mesa/drivers/dri/i965/brw_eu.c @@ -110,6 +110,50 @@ brw_swap_cmod(uint32_t cmod) } } +/** + * Get the least significant bit offset of the i+1-th component of immediate
For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. + */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. + */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + void brw_set_default_exec_size(struct brw_codegen *p, unsigned value) { diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 2ef1d7bb825..6f11f597492 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -3402,7 +3402,7 @@ brw_broadcast(struct brw_codegen *p, */ inst = brw_MOV(p, brw_null_reg(), - stride(brw_swizzle1(idx, 0), 0, 4, 1)); + stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 0, 4, 1)); brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); brw_inst_set_flag_reg_nr(devinfo, inst, 1); diff --git a/src/mesa/drivers/dri/i965/brw_formatquery.c b/src/mesa/drivers/dri/i965/brw_formatquery.c new file mode 100644 index 00000000000..210109b39f7 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_formatquery.c @@ -0,0 +1,171 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_context.h" +#include "brw_state.h" +#include "main/formatquery.h" +#include "main/glformats.h" + +static size_t +brw_query_samples_for_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, int samples[16]) +{ + struct brw_context *brw = brw_context(ctx); + + (void) target; + (void) internalFormat; + + switch (brw->gen) { + case 9: + samples[0] = 16; + samples[1] = 8; + samples[2] = 4; + samples[3] = 2; + return 4; + + case 8: + samples[0] = 8; + samples[1] = 4; + samples[2] = 2; + return 3; + + case 7: + samples[0] = 8; + samples[1] = 4; + return 2; + + case 6: + samples[0] = 4; + return 1; + + default: + assert(brw->gen < 6); + samples[0] = 1; + return 1; + } +} + +/** + * Returns a generic GL type from an internal format, so that it can be used + * together with the base format to obtain a mesa_format by calling + * mesa_format_from_format_and_type(). + */ +static GLenum +get_generic_type_for_internal_format(GLenum internalFormat) +{ + if (_mesa_is_color_format(internalFormat)) { + if (_mesa_is_enum_format_unsigned_int(internalFormat)) + return GL_UNSIGNED_BYTE; + else if (_mesa_is_enum_format_signed_int(internalFormat)) + return GL_BYTE; + } else { + switch (internalFormat) { + case GL_STENCIL_INDEX: + case GL_STENCIL_INDEX8: + return GL_UNSIGNED_BYTE; + case GL_DEPTH_COMPONENT: + case GL_DEPTH_COMPONENT16: + return GL_UNSIGNED_SHORT; + case GL_DEPTH_COMPONENT24: + case GL_DEPTH_COMPONENT32: + return GL_UNSIGNED_INT; + case GL_DEPTH_COMPONENT32F: + return GL_FLOAT; + case GL_DEPTH_STENCIL: + case GL_DEPTH24_STENCIL8: + return GL_UNSIGNED_INT_24_8; + case GL_DEPTH32F_STENCIL8: + return GL_FLOAT_32_UNSIGNED_INT_24_8_REV; + default: + /* fall-through */ + break; + } + } + + return GL_FLOAT; +} + +void +brw_query_internal_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params) +{ + /* The Mesa layer gives us a temporary params buffer that is guaranteed + * to be non-NULL and to have at least 16 elements. + */ + assert(params != NULL); + + switch (pname) { + case GL_SAMPLES: + brw_query_samples_for_format(ctx, target, internalFormat, params); + break; + + case GL_NUM_SAMPLE_COUNTS: { + size_t num_samples; + GLint dummy_buffer[16]; + + num_samples = brw_query_samples_for_format(ctx, target, internalFormat, + dummy_buffer); + params[0] = (GLint) num_samples; + break; + } + + case GL_INTERNALFORMAT_PREFERRED: { + params[0] = GL_NONE; + + /* We need to resolve an internal format that is compatible with + * the passed internal format, and optimal for the driver. For now, + * we just validate that the passed internal format is supported by + * the driver, and if so return the same internal format, otherwise + * return GL_NONE. + * + * For validating the internal format, we use the + * ctx->TextureFormatSupported map to check that a BRW surface format + * exists that can be derived from the internal format. But this + * expects a mesa_format, not an internal format. So we need to "come up" + * with a type that is generic enough to resolve the mesa_format first. + */ + GLenum type = get_generic_type_for_internal_format(internalFormat); + + /* Get a mesa_format from the internal format and type.
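For example (illustrative of the helper above), GL_RGBA8UI resolves to the generic type GL_UNSIGNED_BYTE, while normalized color formats such as GL_RGBA8 fall through to GL_FLOAT.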
*/ + GLint base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format != -1) { + mesa_format mesa_format = + _mesa_format_from_format_and_type(base_format, type); + + if (mesa_format < MESA_FORMAT_COUNT && + ctx->TextureFormatSupported[mesa_format]) { + params[0] = internalFormat; + } + } + break; + } + + default: + /* By default, we call the driver hook's fallback function from the + * frontend, which has a generic implementation for all pnames. + */ + _mesa_query_internal_format_default(ctx, target, internalFormat, pname, + params); + break; + } +} diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 0f9de30f05b..b5f1a874368 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1931,8 +1931,8 @@ fs_visitor::compact_virtual_grfs() void fs_visitor::assign_constant_locations() { - /* Only the first compile (SIMD8 mode) gets to decide on locations. */ - if (dispatch_width != 8) + /* Only the first compile gets to decide on locations. */ + if (dispatch_width != min_dispatch_width) return; bool is_live[uniforms]; @@ -2474,8 +2474,10 @@ fs_visitor::opt_sampler_eot() * we have enough space, but it will make sure the dead code eliminator kills * the instruction that this will replace. */ - if (tex_inst->header_size != 0) + if (tex_inst->header_size != 0) { + invalidate_live_intervals(); return true; + } fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F, load_payload->sources + 1); @@ -2506,6 +2508,7 @@ fs_visitor::opt_sampler_eot() tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload); tex_inst->src[0] = send_header; + invalidate_live_intervals(); return true; } @@ -5236,12 +5239,18 @@ fs_visitor::optimize() void fs_visitor::fixup_3src_null_dest() { + bool progress = false; + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { if (inst->is_3src() && inst->dst.is_null()) { inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + progress = true; } } + + if (progress) + invalidate_live_intervals(); } void @@ -5277,7 +5286,7 @@ fs_visitor::allocate_registers() * SIMD8. There's probably actually some intermediate point where * SIMD16 with a couple of spills is still better. */ - if (dispatch_width == 16) { + if (dispatch_width == 16 && min_dispatch_width <= 8) { fail("Failure to register allocate.
Reduce number of " "live scalar values to avoid this."); } else { @@ -5519,6 +5528,13 @@ fs_visitor::run_cs() if (shader_time_index >= 0) emit_shader_time_begin(); + if (devinfo->is_haswell && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + emit_nir_code(); if (failed) @@ -5782,6 +5798,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, shader->info.cs.local_size[2]; unsigned max_cs_threads = compiler->devinfo->max_cs_threads; + unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); cfg_t *cfg = NULL; const char *fail_msg = NULL; @@ -5791,11 +5808,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, NULL, /* Never used in core profile */ shader, 8, shader_time_index); - if (!v8.run_cs()) { - fail_msg = v8.fail_msg; - } else if (local_workgroup_size <= 8 * max_cs_threads) { - cfg = v8.cfg; - prog_data->simd_size = 8; + if (simd_required <= 8) { + if (!v8.run_cs()) { + fail_msg = v8.fail_msg; + } else { + cfg = v8.cfg; + prog_data->simd_size = 8; + } } fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, @@ -5805,7 +5824,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * max_cs_threads) { /* Try a SIMD16 compile */ - v16.import_uniforms(&v8); + if (simd_required <= 8) + v16.import_uniforms(&v8); if (!v16.run_cs()) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index f1a81c13ef9..2b00129b4ba 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -272,6 +272,8 @@ public: void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); + bool optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result); bool optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result); @@ -405,6 +407,7 @@ public: bool spilled_any_registers; const unsigned dispatch_width; /**< 8 or 16 */ + unsigned min_dispatch_width; int shader_time_index; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 9dbe13df514..2616e65fc62 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -737,8 +737,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { if (try_constant_propagate(inst, entry)) progress = true; - - if (try_copy_propagate(inst, i, entry)) + else if (try_copy_propagate(inst, i, entry)) progress = true; } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 8611b8dc443..29ef609fce3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -491,6 +491,49 @@ fs_visitor::nir_emit_instr(nir_instr *instr) } } +/** + * Recognizes a parent instruction of nir_op_extract_* and changes the type to + * match instr. 
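+ * + * For example (an illustrative sketch): u2f(extract_u8(x, 2)) is emitted as a single SHADER_OPCODE_EXTRACT_BYTE that reads byte 2 of x and writes the converted float directly, instead of an extract followed by a separate conversion MOV.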
+ */ +bool +fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result) +{ + if (!instr->src[0].src.is_ssa || + !instr->src[0].src.ssa->parent_instr) + return false; + + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *src0 = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && + src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) + return false; + + nir_const_value *element = nir_src_as_const_value(src0->src[1].src); + assert(element != NULL); + + enum opcode extract_op; + if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) { + assert(element->u[0] <= 1); + extract_op = SHADER_OPCODE_EXTRACT_WORD; + } else { + assert(element->u[0] <= 3); + extract_op = SHADER_OPCODE_EXTRACT_BYTE; + } + + fs_reg op0 = get_nir_src(src0->src[0].src); + op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]); + op0 = offset(op0, bld, src0->src[0].swizzle[0]); + + set_saturate(instr->dest.saturate, + bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0]))); + return true; +} + bool fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result) @@ -662,6 +705,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) switch (instr->op) { case nir_op_i2f: case nir_op_u2f: + if (optimize_extract_to_float(instr, result)) + return; + inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -2458,8 +2504,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_dec: case nir_intrinsic_atomic_counter_read: { - using namespace surface_access; - /* Get the arguments of the atomic intrinsic. 
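The surface index used below is the ABO section of the binding table (binding_table.abo_start) plus the intrinsic's constant index.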
*/ const fs_reg offset = get_nir_src(instr->src[0]); const unsigned surface = (stage_prog_data->binding_table.abo_start + @@ -2985,12 +3029,11 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, /* Emit the actual atomic operation */ - fs_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } @@ -3012,12 +3055,11 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, /* Emit the actual atomic operation */ - fs_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp index dc2b0c8aa8d..f59fdbddfa6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp @@ -73,7 +73,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) if (scan_inst->saturate) { inst->saturate = false; progress = true; - } else if (src_end_ip <= ip || inst->dst.equals(inst->src[0])) { + } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) { if (scan_inst->can_do_saturate()) { if (scan_inst->dst.type != inst->dst.type) { scan_inst->dst.type = inst->dst.type; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index b4b430dc140..f1da218ba63 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1021,6 +1021,18 @@ fs_visitor::init() unreachable("unhandled shader stage"); } + if (stage == MESA_SHADER_COMPUTE) { + const brw_cs_prog_data *cs_prog_data = + (const brw_cs_prog_data *) prog_data; + unsigned size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; + size = DIV_ROUND_UP(size, devinfo->max_cs_threads); + min_dispatch_width = size > 16 ?
16 : 8); + } else { + min_dispatch_width = 8; + } + this->prog_data = this->stage_prog_data; this->failed = false; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 660becaafa7..2b6872e6d31 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -76,7 +76,11 @@ offset(src_reg reg, unsigned delta) static inline src_reg swizzle(src_reg reg, unsigned swizzle) { - reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + if (reg.file == IMM) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); + else + reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + return reg; } diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index a2a4a40f373..4f9b2526e45 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -81,7 +81,9 @@ struct brw_device_info; #define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) #define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) #define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) +#define BRW_SWIZZLE_XZXZ BRW_SWIZZLE4(0,2,0,2) #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) +#define BRW_SWIZZLE_YWYW BRW_SWIZZLE4(1,3,1,3) #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) #define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) #define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) @@ -221,6 +223,7 @@ enum PACKED brw_reg_type { unsigned brw_reg_type_to_hw_type(const struct brw_device_info *devinfo, enum brw_reg_type type, enum brw_reg_file file); const char *brw_reg_type_letters(unsigned brw_reg_type); +uint32_t brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz); #define REG_SIZE (8*4) @@ -737,6 +740,22 @@ brw_notification_reg(void) } static inline struct brw_reg +brw_sr0_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_STATE, + 0, + 0, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +static inline struct brw_reg brw_acc_reg(unsigned width) { return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, @@ -871,24 +890,17 @@ get_element_d(struct brw_reg reg, unsigned elt) return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt)); } - static inline struct brw_reg -brw_swizzle(struct brw_reg reg, unsigned x, unsigned y, unsigned z, unsigned w) +brw_swizzle(struct brw_reg reg, unsigned swz) { - assert(reg.file != BRW_IMMEDIATE_VALUE); + if (reg.file == BRW_IMMEDIATE_VALUE) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swz); + else + reg.swizzle = brw_compose_swizzle(swz, reg.swizzle); - reg.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w), - reg.swizzle); return reg; } - -static inline struct brw_reg -brw_swizzle1(struct brw_reg reg, unsigned x) -{ - return brw_swizzle(reg, x, x, x, x); -} - static inline struct brw_reg brw_writemask(struct brw_reg reg, unsigned mask) { diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 4f97577515a..5b54b51395c 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -1057,12 +1057,7 @@ fs_instruction_scheduler::calculate_deps() last_accumulator_write = NULL; last_fixed_grf_write = NULL; - exec_node *node; - exec_node *prev; - for (node = instructions.get_tail(), prev = node->prev; - !node->is_head_sentinel(); - node = prev, prev = node->prev) { - schedule_node *n = (schedule_node *)node; + 
foreach_in_list_reverse_safe(schedule_node, n, &instructions) { fs_inst *inst = (fs_inst *)n->inst; /* write-after-read deps. */ @@ -1284,12 +1279,7 @@ vec4_instruction_scheduler::calculate_deps() last_accumulator_write = NULL; last_fixed_grf_write = NULL; - exec_node *node; - exec_node *prev; - for (node = instructions.get_tail(), prev = node->prev; - !node->is_head_sentinel(); - node = prev, prev = node->prev) { - schedule_node *n = (schedule_node *)node; + foreach_in_list_reverse_safe(schedule_node, n, &instructions) { vec4_instruction *inst = (vec4_instruction *)n->inst; /* write-after-read deps. */ diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index dfe6afcf6d0..21977a23130 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -1046,13 +1046,6 @@ backend_shader::calculate_cfg() cfg = new(mem_ctx) cfg_t(&this->instructions); } -void -backend_shader::invalidate_cfg() -{ - ralloc_free(this->cfg); - this->cfg = NULL; -} - /** * Sets up the starting offsets for the groups of binding table entries * common to all pipeline stages. diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 82374a46c18..15bed78cb7c 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -217,7 +217,6 @@ public: virtual void dump_instructions(const char *name); void calculate_cfg(); - void invalidate_cfg(); virtual void invalidate_live_intervals() = 0; }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 0032634f023..65e57ba5e62 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -321,6 +321,28 @@ src_reg::equals(const src_reg &r) const } bool +vec4_visitor::vectorize_mov(bblock_t *block, vec4_instruction *inst, + uint8_t imm[4], vec4_instruction *imm_inst[4], + int inst_count, unsigned writemask) +{ + if (inst_count < 2) + return false; + + unsigned vf; + memcpy(&vf, imm, sizeof(vf)); + vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf)); + mov->dst.type = BRW_REGISTER_TYPE_F; + mov->dst.writemask = writemask; + inst->insert_before(block, mov); + + for (int i = 0; i < inst_count; i++) { + imm_inst[i]->remove(block); + } + + return true; +} + +bool vec4_visitor::opt_vector_float() { bool progress = false; @@ -328,27 +350,38 @@ vec4_visitor::opt_vector_float() int last_reg = -1, last_reg_offset = -1; enum brw_reg_file last_reg_file = BAD_FILE; - int remaining_channels = 0; - uint8_t imm[4]; + uint8_t imm[4] = { 0 }; int inst_count = 0; vec4_instruction *imm_inst[4]; + unsigned writemask = 0; foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { if (last_reg != inst->dst.nr || last_reg_offset != inst->dst.reg_offset || last_reg_file != inst->dst.file) { + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, + writemask); + inst_count = 0; + writemask = 0; last_reg = inst->dst.nr; last_reg_offset = inst->dst.reg_offset; last_reg_file = inst->dst.file; - remaining_channels = WRITEMASK_XYZW; - inst_count = 0; + for (int i = 0; i < 4; i++) { + imm[i] = 0; + } } if (inst->opcode != BRW_OPCODE_MOV || inst->dst.writemask == WRITEMASK_XYZW || inst->src[0].file != IMM || + inst->predicate != BRW_PREDICATE_NONE) { + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, + writemask); + inst_count = 0; + last_reg = -1; continue; + } int vf =
brw_float_to_vf(inst->src[0].f); if (vf == -1) @@ -363,23 +396,8 @@ vec4_visitor::opt_vector_float() if ((inst->dst.writemask & WRITEMASK_W) != 0) imm[3] = vf; + writemask |= inst->dst.writemask; imm_inst[inst_count++] = inst; - - remaining_channels &= ~inst->dst.writemask; - if (remaining_channels == 0) { - unsigned vf; - memcpy(&vf, imm, sizeof(vf)); - vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf)); - mov->dst.type = BRW_REGISTER_TYPE_F; - mov->dst.writemask = WRITEMASK_XYZW; - inst->insert_after(block, mov); - last_reg = -1; - - for (int i = 0; i < inst_count; i++) { - imm_inst[i]->remove(block); - } - progress = true; - } } if (progress) @@ -1027,6 +1045,7 @@ vec4_visitor::opt_register_coalesce() if (is_nop_mov) { inst->remove(block); + progress = true; continue; } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 7b86e1bc050..6143f65efa1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -274,13 +274,6 @@ public: void emit_shader_time_end(); void emit_shader_time_write(int shader_time_subindex, src_reg value); - void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg offset, src_reg src0, - src_reg src1); - - void emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg offset); - src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, src_reg *reladdr, int reg_offset); void emit_scratch_read(bblock_t *block, vec4_instruction *inst, @@ -366,6 +359,10 @@ protected: virtual void gs_end_primitive(); private: + bool vectorize_mov(bblock_t *block, vec4_instruction *inst, + uint8_t imm[4], vec4_instruction *imm_inst[4], + int inst_count, unsigned writemask); + /** * If true, then register allocation should fail instead of spilling. */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index 6bd992882b8..92423e1f942 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -76,22 +76,6 @@ is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); } -static unsigned -swizzle_vf_imm(unsigned vf4, unsigned swizzle) -{ - union { - unsigned vf4; - uint8_t vf[4]; - } v = { vf4 }, ret; - - ret.vf[0] = v.vf[BRW_GET_SWZ(swizzle, 0)]; - ret.vf[1] = v.vf[BRW_GET_SWZ(swizzle, 1)]; - ret.vf[2] = v.vf[BRW_GET_SWZ(swizzle, 2)]; - ret.vf[3] = v.vf[BRW_GET_SWZ(swizzle, 3)]; - - return ret.vf4; -} - static bool is_logic_op(enum opcode opcode) { @@ -101,21 +85,66 @@ is_logic_op(enum opcode opcode) opcode == BRW_OPCODE_NOT); } +/** + * Get the origin of a copy as a single register if all components present in + * the given readmask originate from the same register and have compatible + * regions, otherwise return a BAD_FILE register. + */ +static src_reg +get_copy_value(const copy_entry &entry, unsigned readmask) +{ + unsigned swz[4] = {}; + src_reg value; + + for (unsigned i = 0; i < 4; i++) { + if (readmask & (1 << i)) { + if (entry.value[i]) { + src_reg src = *entry.value[i]; + + if (src.file == IMM) { + swz[i] = i; + } else { + swz[i] = BRW_GET_SWZ(src.swizzle, i); + /* Overwrite the original swizzle so the src_reg::equals call + * below doesn't care about it, the correct swizzle will be + * calculated once the swizzles of all components are known. 
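+ * (For instance, if channel .x of the copy reads r1.z and channel .y reads r1.w, both compare equal here as plain r1, and the final .zw swizzle is recomposed by the brw_compose_swizzle() call at the end of this function.)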
+ */ + src.swizzle = BRW_SWIZZLE_XYZW; + } + + if (value.file == BAD_FILE) { + value = src; + } else if (!value.equals(src)) { + return src_reg(); + } + } else { + return src_reg(); + } + } + } + + return swizzle(value, + brw_compose_swizzle(brw_swizzle_for_mask(readmask), + BRW_SWIZZLE4(swz[0], swz[1], + swz[2], swz[3]))); +} + static bool try_constant_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, - int arg, struct copy_entry *entry) + int arg, const copy_entry *entry) { /* For constant propagation, we only handle the same constant * across all 4 channels. Some day, we should handle the 8-bit * float vector format, which would let us constant propagate * vectors better. + * We could be more aggressive here -- some channels might not get used + * based on the destination writemask. */ - src_reg value = *entry->value[0]; - for (int i = 1; i < 4; i++) { - if (!value.equals(*entry->value[i])) - return false; - } + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); if (value.file != IMM) return false; @@ -144,8 +173,7 @@ try_constant_propagate(const struct brw_device_info *devinfo, } } - if (value.type == BRW_REGISTER_TYPE_VF) - value.ud = swizzle_vf_imm(value.ud, inst->src[arg].swizzle); + value = swizzle(value, inst->src[arg].swizzle); switch (inst->opcode) { case BRW_OPCODE_MOV: @@ -255,38 +283,15 @@ try_constant_propagate(const struct brw_device_info *devinfo, static bool try_copy_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, int arg, - struct copy_entry *entry, int attributes_per_reg) + const copy_entry *entry, int attributes_per_reg) { /* Build up the value we are propagating as if it were the source of a * single MOV */ - /* For constant propagation, we only handle the same constant - * across all 4 channels. Some day, we should handle the 8-bit - * float vector format, which would let us constant propagate - * vectors better. - */ - src_reg value = *entry->value[0]; - for (int i = 1; i < 4; i++) { - /* This is equals() except we don't care about the swizzle. */ - if (value.file != entry->value[i]->file || - value.nr != entry->value[i]->nr || - value.reg_offset != entry->value[i]->reg_offset || - value.type != entry->value[i]->type || - value.negate != entry->value[i]->negate || - value.abs != entry->value[i]->abs) { - return false; - } - } - - /* Compute the swizzle of the original register by swizzling the - * component loaded from each value according to the swizzle of - * operand we're going to change. - */ - int s[4]; - for (int i = 0; i < 4; i++) { - s[i] = BRW_GET_SWZ(entry->value[i]->swizzle, i); - } - value.swizzle = BRW_SWIZZLE4(s[0], s[1], s[2], s[3]); + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); /* Check that we can propagate that value */ if (value.file != UNIFORM && @@ -435,43 +440,13 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop) if (inst->regs_read(i) != 1) continue; - int reg = (alloc.offsets[inst->src[i].nr] + - inst->src[i].reg_offset); - - /* Find the regs that each swizzle component came from. - */ - struct copy_entry entry; - memset(&entry, 0, sizeof(copy_entry)); - int c; - for (c = 0; c < 4; c++) { - int channel = BRW_GET_SWZ(inst->src[i].swizzle, c); - entry.value[c] = entries[reg].value[channel]; - - /* If there's no available copy for this channel, bail. 
- * We could be more aggressive here -- some channels might - * not get used based on the destination writemask. - */ - if (!entry.value[c]) - break; - - entry.saturatemask |= - (entries[reg].saturatemask & (1 << channel) ? 1 : 0) << c; - - /* We'll only be able to copy propagate if the sources are - * all from the same file -- there's no ability to swizzle - * 0 or 1 constants in with source registers like in i915. - */ - if (c > 0 && entry.value[c - 1]->file != entry.value[c]->file) - break; - } - - if (c != 4) - continue; + const unsigned reg = (alloc.offsets[inst->src[i].nr] + + inst->src[i].reg_offset); + const copy_entry &entry = entries[reg]; if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry)) progress = true; - - if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) + else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) progress = true; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 17d5f2aeff4..4686f2014c6 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -717,24 +717,34 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) (unsigned) instr->const_index[0]; src_reg offset = get_nir_src(instr->src[0], nir_type_int, instr->num_components); + const src_reg surface = brw_imm_ud(surf_index); + const vec4_builder bld = + vec4_builder(this).at_end().annotate(current_annotation, base_ir); + src_reg tmp; + dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { - case nir_intrinsic_atomic_counter_inc: - emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset, - src_reg(), src_reg()); - break; - case nir_intrinsic_atomic_counter_dec: - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset, - src_reg(), src_reg()); - break; - case nir_intrinsic_atomic_counter_read: - emit_untyped_surface_read(surf_index, dest, offset); - break; - default: - unreachable("Unreachable"); + case nir_intrinsic_atomic_counter_inc: + tmp = emit_untyped_atomic(bld, surface, offset, + src_reg(), src_reg(), + 1, 1, + BRW_AOP_INC); + break; + case nir_intrinsic_atomic_counter_dec: + tmp = emit_untyped_atomic(bld, surface, offset, + src_reg(), src_reg(), + 1, 1, + BRW_AOP_PREDEC); + break; + case nir_intrinsic_atomic_counter_read: + tmp = emit_untyped_read(bld, surface, offset, 1, 1); + break; + default: + unreachable("Unreachable"); } + bld.MOV(retype(dest, tmp.type), tmp); brw_mark_surface_used(stage_prog_data, surf_index); break; } @@ -861,12 +871,11 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) const vec4_builder bld = vec4_builder(this).at_end().annotate(current_annotation, base_ir); - src_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + src_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp index 28002c56cdc..1db349ab8ce 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp @@ -221,7 +221,7 @@ namespace brw { emit_insert(bld, addr, dims, has_simd4x2), has_simd4x2 ? 
1 : dims, emit_insert(bld, src_reg(srcs), size, has_simd4x2), - has_simd4x2 ? 1 : size, + has_simd4x2 && size ? 1 : size, surface, op, rsize, pred); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 8418a3cdc01..4cfbc143d5a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1115,61 +1115,6 @@ vec4_visitor::gs_end_primitive() } void -vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg surf_offset, - src_reg src0, src_reg src1) -{ - unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE); - src_reg src_payload(this, glsl_type::uint_type, mlen); - dst_reg payload(src_payload); - payload.writemask = WRITEMASK_X; - - /* Set the atomic operation offset. */ - emit(MOV(offset(payload, 0), surf_offset)); - unsigned i = 1; - - /* Set the atomic operation arguments. */ - if (src0.file != BAD_FILE) { - emit(MOV(offset(payload, i), src0)); - i++; - } - - if (src1.file != BAD_FILE) { - emit(MOV(offset(payload, i), src1)); - i++; - } - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped atomic message on Ivy Bridge, but that's OK because - * unused channels will be masked out. - */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, - src_payload, - brw_imm_ud(surf_index), brw_imm_ud(atomic_op)); - inst->mlen = mlen; -} - -void -vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg surf_offset) -{ - dst_reg offset(this, glsl_type::uint_type); - offset.writemask = WRITEMASK_X; - - /* Set the surface read offset. */ - emit(MOV(offset, surf_offset)); - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped surface read message, but that's OK because unused - * channels will be masked out. 
- */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, - src_reg(offset), - brw_imm_ud(surf_index), brw_imm_d(1)); - inst->mlen = 1; -} - -void vec4_visitor::emit_ndc_computation() { if (output_reg[VARYING_SLOT_POS].file == BAD_FILE) diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index 9935557ae70..08f9bb3330a 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -611,11 +611,11 @@ gen6_gs_visitor::xfb_write() emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); emit(IF(BRW_PREDICATE_NORMAL)); { - src_reg destination_indices_uw = - retype(destination_indices, BRW_REGISTER_TYPE_UW); - - vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw), - brw_imm_v(0x00020100))); /* (0, 1, 2) */ + vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), + brw_imm_vf4(brw_float_to_vf(0.0), + brw_float_to_vf(1.0), + brw_float_to_vf(2.0), + brw_float_to_vf(0.0)))); inst->force_writemask_all = true; emit(ADD(dst_reg(this->destination_indices), diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index a39693b68f7..60ac124ecd0 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -196,6 +196,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_half_float_vertex = true; ctx->Extensions.ARB_instanced_arrays = true; ctx->Extensions.ARB_internalformat_query = true; + ctx->Extensions.ARB_internalformat_query2 = true; ctx->Extensions.ARB_map_buffer_range = true; ctx->Extensions.ARB_occlusion_query = true; ctx->Extensions.ARB_occlusion_query2 = true; diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c index 3a4a53a07e6..b7b679686e5 100644 --- a/src/mesa/drivers/dri/i965/intel_fbo.c +++ b/src/mesa/drivers/dri/i965/intel_fbo.c @@ -289,7 +289,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend rb->NumSamples = intel_quantize_num_samples(screen, rb->NumSamples); rb->Width = width; rb->Height = height; - rb->_BaseFormat = _mesa_base_fbo_format(ctx, internalFormat); + rb->_BaseFormat = _mesa_get_format_base_format(rb->Format); intel_miptree_release(&irb->mt); diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index ee7c1d7bc2c..c6eb50aaba8 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -891,7 +891,7 @@ brw_query_renderer_string(__DRIscreen *psp, int param, const char **value) value[0] = brw_vendor_string; return 0; case __DRI2_RENDERER_DEVICE_ID: - value[0] = brw_get_renderer_string(intelScreen->deviceID); + value[0] = brw_get_renderer_string(intelScreen); return 0; default: break; @@ -1082,6 +1082,7 @@ static bool intel_init_bufmgr(struct intel_screen *intelScreen) { __DRIscreen *spriv = intelScreen->driScrnPriv; + bool devid_override = getenv("INTEL_DEVID_OVERRIDE") != NULL; intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL; @@ -1099,6 +1100,25 @@ intel_init_bufmgr(struct intel_screen *intelScreen) return false; } + intelScreen->subslice_total = -1; + intelScreen->eu_total = -1; + + /* Everything below this is for real hardware only */ + if (intelScreen->no_hw || devid_override) + return true; + + intel_get_param(spriv, I915_PARAM_SUBSLICE_TOTAL, + &intelScreen->subslice_total); + intel_get_param(spriv, I915_PARAM_EU_TOTAL, 
&intelScreen->eu_total); + + /* Without this information, we cannot get the right Braswell brandstrings, + * and we have to use conservative numbers for GPGPU on many platforms, but + * otherwise, things will just work. + */ + if (intelScreen->subslice_total == -1 || intelScreen->eu_total == -1) + _mesa_warning(NULL, + "Kernel 4.1 required to properly query GPU properties.\n"); + return true; } diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h index 3a5f22c3a67..01d45d0c016 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.h +++ b/src/mesa/drivers/dri/i965/intel_screen.h @@ -81,7 +81,17 @@ struct intel_screen * I915_PARAM_CMD_PARSER_VERSION parameter */ int cmd_parser_version; - }; + + /** + * Number of subslices reported by the I915_PARAM_SUBSLICE_TOTAL parameter + */ + int subslice_total; + + /** + * Number of EUs reported by the I915_PARAM_EU_TOTAL parameter + */ + int eu_total; +}; extern void intelDestroyContext(__DRIcontext * driContextPriv); diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index e21c3ac543f..1601edddef6 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -50,7 +50,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw, width <<= 1; if (height != 1) height <<= 1; - if (depth != 1) + if (intelObj->base.Target == GL_TEXTURE_3D) depth <<= 1; } @@ -111,7 +111,6 @@ intelTexImage(struct gl_context * ctx, texImage->Width, texImage->Height, texImage->Depth, format, type, pixels, - false /*allocate_storage*/, tex_busy, unpack); if (ok) return; diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index 573f701acdd..4849a4151e2 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -214,7 +214,7 @@ intelTexSubImage(struct gl_context * ctx, ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage, xoffset, yoffset, zoffset, width, height, depth, format, type, - pixels, false, tex_busy, packing); + pixels, tex_busy, packing); if (ok) return; diff --git a/src/mesa/drivers/x11/xmesa.h b/src/mesa/drivers/x11/xmesa.h index b6a2576d492..cc878e7402e 100644 --- a/src/mesa/drivers/x11/xmesa.h +++ b/src/mesa/drivers/x11/xmesa.h @@ -64,15 +64,15 @@ and create a window, you must do the following to use the X/Mesa interface: #ifndef XMESA_H #define XMESA_H -#ifdef __cplusplus -extern "C" { -#endif - #include <X11/Xlib.h> #include <X11/Xutil.h> #include "xmesa_x.h" #include "GL/gl.h" +#ifdef __cplusplus +extern "C" { +#endif + #define XMESA_MAJOR_VERSION 6 #define XMESA_MINOR_VERSION 3 diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h index 820ae072da6..bc5e56923b8 100644 --- a/src/mesa/main/config.h +++ b/src/mesa/main/config.h @@ -132,11 +132,6 @@ */ #define MAX_TEXTURE_UNITS ((MAX_TEXTURE_COORD_UNITS > MAX_TEXTURE_IMAGE_UNITS) ? 
MAX_TEXTURE_COORD_UNITS : MAX_TEXTURE_IMAGE_UNITS) - - /** Maximum viewport size */ -#define MAX_VIEWPORT_WIDTH 16384 -#define MAX_VIEWPORT_HEIGHT 16384 - /** Maximum number of viewports supported with ARB_viewport_array */ #define MAX_VIEWPORTS 16 diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index 26eee28db4e..dbba136f526 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -582,8 +582,8 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api) consts->MaxLights = MAX_LIGHTS; consts->MaxShininess = 128.0; consts->MaxSpotExponent = 128.0; - consts->MaxViewportWidth = MAX_VIEWPORT_WIDTH; - consts->MaxViewportHeight = MAX_VIEWPORT_HEIGHT; + consts->MaxViewportWidth = 16384; + consts->MaxViewportHeight = 16384; consts->MinMapBufferAlignment = 64; /* Driver must override these values if ARB_viewport_array is supported. */ diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 3f5aa5db051..60bc8ef4411 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -184,22 +184,24 @@ struct dd_function_table { GLenum srcFormat, GLenum srcType ); /** - * Determine sample counts support for a particular target and format + * Queries different driver parameters for a particular target and format. + * Since ARB_internalformat_query2 introduced several new query parameters + * over ARB_internalformat_query, having one driver hook for each parameter + * is no longer feasible. So this is the generic entry-point for calls + * to glGetInternalformativ and glGetInternalformati64v, after Mesa has + * checked errors and default values. * * \param ctx GL context * \param target GL target enum * \param internalFormat GL format enum - * \param samples Buffer to hold the returned sample counts. - * Drivers \b must \b not return more than 16 counts. - * - * \returns - * The number of sample counts actually written to \c samples. If - * \c internaFormat is not renderable, zero is returned. - */ - size_t (*QuerySamplesForFormat)(struct gl_context *ctx, - GLenum target, - GLenum internalFormat, - int samples[16]); + * \param pname GL enum that specifies the info to query. + * \param params Buffer to hold the result of the query.
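+ * + * Mesa guarantees that \c params is non-NULL and holds at least 16 elements, so driver implementations (the i965 hook above relies on this) need not validate the buffer again.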
+ */ + void (*QueryInternalFormat)(struct gl_context *ctx, + GLenum target, + GLenum internalFormat, + GLenum pname, + GLint *params); /** * Called by glTexImage[123]D() and glCopyTexImage[12]D() diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c index 10ee6757cc1..c2b9f053352 100644 --- a/src/mesa/main/debug_output.c +++ b/src/mesa/main/debug_output.c @@ -761,15 +761,11 @@ _mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val) GLint _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) { - struct gl_debug_state *debug; GLint val; - mtx_lock(&ctx->DebugMutex); - debug = ctx->Debug; - if (!debug) { - mtx_unlock(&ctx->DebugMutex); + struct gl_debug_state *debug = _mesa_lock_debug_state(ctx); + if (!debug) return 0; - } switch (pname) { case GL_DEBUG_OUTPUT: @@ -794,7 +790,7 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) break; } - mtx_unlock(&ctx->DebugMutex); + _mesa_unlock_debug_state(ctx); return val; } @@ -806,15 +802,11 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) void * _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname) { - struct gl_debug_state *debug; void *val; + struct gl_debug_state *debug = _mesa_lock_debug_state(ctx); - mtx_lock(&ctx->DebugMutex); - debug = ctx->Debug; - if (!debug) { - mtx_unlock(&ctx->DebugMutex); + if (!debug) return NULL; - } switch (pname) { case GL_DEBUG_CALLBACK_FUNCTION_ARB: @@ -829,7 +821,7 @@ _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname) break; } - mtx_unlock(&ctx->DebugMutex); + _mesa_unlock_debug_state(ctx); return val; } diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 50e050e9009..54a5bb057a3 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -73,6 +73,7 @@ EXT(ARB_half_float_vertex , ARB_half_float_vertex EXT(ARB_indirect_parameters , ARB_indirect_parameters , x , GLC, x , x , 2013) EXT(ARB_instanced_arrays , ARB_instanced_arrays , GLL, GLC, x , x , 2008) EXT(ARB_internalformat_query , ARB_internalformat_query , GLL, GLC, x , x , 2011) +EXT(ARB_internalformat_query2 , ARB_internalformat_query2 , GLL, GLC, x , x , 2013) EXT(ARB_invalidate_subdata , dummy_true , GLL, GLC, x , x , 2012) EXT(ARB_map_buffer_alignment , dummy_true , GLL, GLC, x , x , 2011) EXT(ARB_map_buffer_range , ARB_map_buffer_range , GLL, GLC, x , x , 2008) @@ -95,6 +96,7 @@ EXT(ARB_sampler_objects , dummy_true EXT(ARB_seamless_cube_map , ARB_seamless_cube_map , GLL, GLC, x , x , 2009) EXT(ARB_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2013) EXT(ARB_separate_shader_objects , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_shader_atomic_counter_ops , ARB_shader_atomic_counter_ops , GLL, GLC, x , x , 2015) EXT(ARB_shader_atomic_counters , ARB_shader_atomic_counters , GLL, GLC, x , x , 2011) EXT(ARB_shader_bit_encoding , ARB_shader_bit_encoding , GLL, GLC, x , x , 2010) EXT(ARB_shader_clock , ARB_shader_clock , GLL, GLC, x , x , 2015) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index c9e1518ab23..d490918b816 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -3580,8 +3580,22 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, const struct gl_renderbuffer_attachment *att; GLenum err; - /* The error differs in GL and GLES. */ - err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM; + /* The error code for an attachment type of GL_NONE differs between APIs. 
+ * + * From the ES 2.0.25 specification, page 127: + * "If the value of FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is NONE, then + * querying any other pname will generate INVALID_ENUM." + * + * From the OpenGL 3.0 specification, page 337, or identically, + * the OpenGL ES 3.0.4 specification, page 240: + * + * "If the value of FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is NONE, no + * framebuffer is bound to target. In this case querying pname + * FRAMEBUFFER_ATTACHMENT_OBJECT_NAME will return zero, and all other + * queries will generate an INVALID_OPERATION error." + */ + err = ctx->API == API_OPENGLES2 && ctx->Version < 30 ? + GL_INVALID_ENUM : GL_INVALID_OPERATION; if (_mesa_is_winsys_fbo(buffer)) { /* Page 126 (page 136 of the PDF) of the OpenGL ES 2.0.25 spec @@ -4170,7 +4184,8 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments, */ invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments, 0, 0, - MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT, + ctx->Const.MaxViewportWidth, + ctx->Const.MaxViewportHeight, "glInvalidateFramebuffer"); } @@ -4210,7 +4225,8 @@ _mesa_InvalidateNamedFramebufferData(GLuint framebuffer, */ invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments, 0, 0, - MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT, + ctx->Const.MaxViewportWidth, + ctx->Const.MaxViewportHeight, "glInvalidateNamedFramebufferData"); } diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index 816f12bf9e2..215c14f889f 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -28,155 +28,1495 @@ #include "enums.h" #include "fbobject.h" #include "formatquery.h" +#include "teximage.h" +#include "texparam.h" +#include "texobj.h" +#include "get.h" +#include "genmipmap.h" +#include "shaderimage.h" +#include "texcompress.h" +#include "textureview.h" -/* default implementation of QuerySamplesForFormat driverfunc, for - * non-multisample-capable drivers. */ -size_t -_mesa_query_samples_for_format(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]) +static bool +_is_renderable(struct gl_context *ctx, GLenum internalformat) { - (void) target; - (void) internalFormat; - (void) ctx; + /* Section 4.4.4 on page 212 of the GLES 3.0.4 spec says: + * + * "An internal format is color-renderable if it is one of the + * formats from table 3.13 noted as color-renderable or if it + * is unsized format RGBA or RGB." + * + * Therefore, we must accept GL_RGB and GL_RGBA here. + */ + if (internalformat != GL_RGB && internalformat != GL_RGBA && + _mesa_base_fbo_format(ctx, internalformat) == 0) + return false; - samples[0] = 1; - return 1; + return true; } +/* Handles the cases where either ARB_internalformat_query or + * ARB_internalformat_query2 has to return an error.
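+ * + * Returns false after recording the GL error when validation fails, so the callers can simply bail out.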
+ */ +static bool +_legal_parameters(struct gl_context *ctx, GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint *params) -void GLAPIENTRY -_mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, - GLsizei bufSize, GLint *params) { - GLint buffer[16]; - GLsizei count = 0; - GET_CURRENT_CONTEXT(ctx); + bool query2 = _mesa_has_ARB_internalformat_query2(ctx); - ASSERT_OUTSIDE_BEGIN_END(ctx); - - if (!ctx->Extensions.ARB_internalformat_query) { - _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformativ"); - return; - } - - assert(ctx->Driver.QuerySamplesForFormat != NULL); - - /* The ARB_internalformat_query spec says: + /* The ARB_internalformat_query2 spec says: * - * "If the <target> parameter to GetInternalformativ is not one of - * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY or RENDERBUFFER - * then an INVALID_ENUM error is generated." + * "The INVALID_ENUM error is generated if the <target> parameter to + * GetInternalformati*v is not one of the targets listed in Table 6.xx. */ - switch (target) { + switch(target){ + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_3D: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + case GL_TEXTURE_BUFFER: + if (!query2) { + /* The ARB_internalformat_query spec says: + * + * "If the <target> parameter to GetInternalformativ is not one of + * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY + * or RENDERBUFFER then an INVALID_ENUM error is generated. + */ + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(target=%s)", + _mesa_enum_to_string(target)); + + return false; + } + break; + case GL_RENDERBUFFER: break; case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: - /* These enums are only valid if ARB_texture_multisample is supported */ - if ((_mesa_is_desktop_gl(ctx) && - ctx->Extensions.ARB_texture_multisample) || - _mesa_is_gles31(ctx)) - break; + /* The non-existence of ARB_texture_multisample is treated in + * ARB_internalformat_query implementation like an error. + */ + if (!query2 && + !(_mesa_has_ARB_texture_multisample(ctx) || _mesa_is_gles31(ctx))) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(target=%s)", + _mesa_enum_to_string(target)); + + return false; + } + break; default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetInternalformativ(target=%s)", _mesa_enum_to_string(target)); - return; + return false; } - /* The ARB_internalformat_query spec says: - * - * "If the <internalformat> parameter to GetInternalformativ is not - * color-, depth- or stencil-renderable, then an INVALID_ENUM error is - * generated." - * - * Page 243 of the GLES 3.0.4 spec says this for GetInternalformativ: - * - * "internalformat must be color-renderable, depth-renderable or - * stencilrenderable (as defined in section 4.4.4)." - * - * Section 4.4.4 on page 212 of the same spec says: - * - * "An internal format is color-renderable if it is one of the - * formats from table 3.13 noted as color-renderable or if it - * is unsized format RGBA or RGB." + + /* The ARB_internalformat_query2 spec says: * - * Therefore, we must accept GL_RGB and GL_RGBA here. + * "The INVALID_ENUM error is generated if the <pname> parameter is + * not one of the listed possibilities. 
*/ - if (internalformat != GL_RGB && internalformat != GL_RGBA && - _mesa_base_fbo_format(ctx, internalformat) == 0) { + switch(pname){ + case GL_SAMPLES: + case GL_NUM_SAMPLE_COUNTS: + break; + + case GL_SRGB_DECODE_ARB: + /* The ARB_internalformat_query2 spec says: + * + * "If ARB_texture_sRGB_decode or EXT_texture_sRGB_decode or + * equivalent functionality is not supported, queries for the + * SRGB_DECODE_ARB <pname> set the INVALID_ENUM error. + */ + if (!_mesa_has_EXT_texture_sRGB_decode(ctx)) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + return false; + } + /* fallthrough */ + case GL_INTERNALFORMAT_SUPPORTED: + case GL_INTERNALFORMAT_PREFERRED: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: + case GL_MAX_LAYERS: + case GL_MAX_COMBINED_DIMENSIONS: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_TYPE: + case GL_MIPMAP: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_COLOR_ENCODING: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_FILTER: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_IMAGE_TEXEL_SIZE: + case GL_IMAGE_COMPATIBILITY_CLASS: + case GL_IMAGE_PIXEL_FORMAT: + case GL_IMAGE_PIXEL_TYPE: + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_TEXTURE_COMPRESSED: + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + /* The ARB_internalformat_query spec says: + * + * "If the <pname> parameter to GetInternalformativ is not SAMPLES + * or NUM_SAMPLE_COUNTS, then an INVALID_ENUM error is generated." 
+ */ + if (!query2) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + + return false; + } + break; + + default: + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + return false; + } /* The ARB_internalformat_query spec says: * * "If the <bufSize> parameter to GetInternalformativ is negative, then * an INVALID_VALUE error is generated." + * + * Nothing is said in ARB_internalformat_query2 but we assume the same. */ if (bufSize < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glGetInternalformativ(target=%s)", _mesa_enum_to_string(target)); + return false; + } + + /* The ARB_internalformat_query spec says: + * + * "If the <internalformat> parameter to GetInternalformativ is not + * color-, depth- or stencil-renderable, then an INVALID_ENUM error is + * generated." + */ + if (!query2 && !_is_renderable(ctx, internalformat)) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(internalformat=%s)", + _mesa_enum_to_string(internalformat)); + return false; + } + + return true; +} + +/* Sets the appropriate "unsupported" response as defined by the + * ARB_internalformat_query2 spec for each <pname>. + */ +static void +_set_default_response(GLenum pname, GLint buffer[16]) +{ + /* The ARB_internalformat_query2 spec defines which response best + * represents "not supported" or "not applicable" for each <pname>. + * + * " In general: + * - size- or count-based queries will return zero, + * - support-, format- or type-based queries will return NONE, + * - boolean-based queries will return FALSE, and + * - list-based queries return no entries." + */ + switch(pname) { + case GL_SAMPLES: + break; + + case GL_MAX_COMBINED_DIMENSIONS: + /* This value can be a 64-bit value. As the default is the 32-bit query, + * we pack 2 32-bit integers.
So we need to clean both */ + buffer[0] = 0; + buffer[1] = 0; + break; + + case GL_NUM_SAMPLE_COUNTS: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: + case GL_MAX_LAYERS: + case GL_IMAGE_TEXEL_SIZE: + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: + buffer[0] = 0; + break; + + case GL_INTERNALFORMAT_PREFERRED: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_TYPE: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_COLOR_ENCODING: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_SRGB_DECODE_ARB: + case GL_FILTER: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_IMAGE_COMPATIBILITY_CLASS: + case GL_IMAGE_PIXEL_FORMAT: + case GL_IMAGE_PIXEL_TYPE: + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + buffer[0] = GL_NONE; + break; + + case GL_INTERNALFORMAT_SUPPORTED: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + case GL_MIPMAP: + case GL_TEXTURE_COMPRESSED: + buffer[0] = GL_FALSE; + break; + + default: + unreachable("invalid 'pname'"); + } +} + +static bool +_is_target_supported(struct gl_context *ctx, GLenum target) +{ + /* The ARB_internalformat_query2 spec says: + * + * "if a particular type of <target> is not supported by the + * implementation the "unsupported" answer should be given. + * This is not an error." 
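+ * + * Accordingly, each target below is gated on the extension or API version that introduces it; when this helper returns false, the caller is expected to write the pname's default "unsupported" response rather than raise an error.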
+ */ + switch(target){ + case GL_TEXTURE_2D: + case GL_TEXTURE_3D: + break; + + case GL_TEXTURE_1D: + if (!_mesa_is_desktop_gl(ctx)) + return false; + break; + + case GL_TEXTURE_1D_ARRAY: + if (!_mesa_has_EXT_texture_array(ctx)) + return false; + break; + + case GL_TEXTURE_2D_ARRAY: + if (!(_mesa_has_EXT_texture_array(ctx) || _mesa_is_gles3(ctx))) + return false; + break; + + case GL_TEXTURE_CUBE_MAP: + if (!_mesa_has_ARB_texture_cube_map(ctx)) + return false; + break; + + case GL_TEXTURE_CUBE_MAP_ARRAY: + if (!_mesa_has_ARB_texture_cube_map_array(ctx)) + return false; + break; + + case GL_TEXTURE_RECTANGLE: + if (!_mesa_has_NV_texture_rectangle(ctx)) + return false; + break; + + case GL_TEXTURE_BUFFER: + if (!_mesa_has_ARB_texture_buffer_object(ctx)) + return false; + break; + + case GL_RENDERBUFFER: + if (!(_mesa_has_ARB_framebuffer_object(ctx) || + _mesa_is_gles3(ctx))) + return false; + break; + + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + if (!(_mesa_has_ARB_texture_multisample(ctx) || + _mesa_is_gles31(ctx))) + return false; + break; + + default: + unreachable("invalid target"); + } + + return true; +} + +static bool +_is_resource_supported(struct gl_context *ctx, GLenum target, + GLenum internalformat, GLenum pname) +{ + /* From the ARB_internalformat_query2 spec: + * + * In the following descriptions, the term /resource/ is used to generically + * refer to an object of the appropriate type that has been created with + * <internalformat> and <target>. If the particular <target> and + * <internalformat> combination do not make sense, ... the "unsupported" + * answer should be given. This is not an error. + */ + + /* In the ARB_internalformat_query2 spec wording, some <pnames> do not care + * about the /resource/ being supported or not, so we return 'true' for those. + */ + switch (pname) { + case GL_INTERNALFORMAT_SUPPORTED: + case GL_INTERNALFORMAT_PREFERRED: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + return true; + default: + break; + } + + switch(target){ + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_3D: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + /* Based on what Mesa does for glTexImage1D/2D/3D and + * glCompressedTexImage1D/2D/3D functions. + */ + if (_mesa_base_tex_format(ctx, internalformat) < 0) + return false; + + /* additional checks for depth textures */ + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat)) + return false; + + /* additional checks for compressed textures */ + if (_mesa_is_compressed_format(ctx, internalformat) && + (!_mesa_target_can_be_compressed(ctx, target, internalformat, NULL) || + _mesa_format_no_online_compression(ctx, internalformat))) + return false; + + break; + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + /* Based on what Mesa does for glTexImage2D/3DMultisample, + * glTexStorage2D/3DMultisample and + * glTextureStorage2D/3DMultisample functions. + */ + if (!_mesa_is_renderable_texture_format(ctx, internalformat)) + return false; + + break; + case GL_TEXTURE_BUFFER: + /* Based on what Mesa does for the glTexBuffer function.
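+ * For example (hypothetical query, not part of this patch), asking for
+ * GL_MAX_WIDTH of a compressed format on GL_TEXTURE_BUFFER:
+ *
+ *    GLint max_width;
+ *    glGetInternalformativ(GL_TEXTURE_BUFFER, GL_COMPRESSED_RGB8_ETC2,
+ *                          GL_MAX_WIDTH, 1, &max_width);
+ *
+ * yields the "unsupported" default of 0, because compressed formats are
+ * not valid texture buffer formats.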
*/ + if (_mesa_validate_texbuffer_format(ctx, internalformat) == + MESA_FORMAT_NONE) + return false; + + break; + case GL_RENDERBUFFER: + /* Based on what Mesa does for glRenderbufferStorage(Multisample) and + * glNamedRenderbufferStorage functions. + */ + if (!_mesa_base_fbo_format(ctx, internalformat)) + return false; + + break; + default: + unreachable("bad target"); + } + + return true; +} + +static bool +_is_internalformat_supported(struct gl_context *ctx, GLenum target, + GLenum internalformat) +{ + /* From the ARB_internalformat_query2 specification: + * + * "- INTERNALFORMAT_SUPPORTED: If <internalformat> is an internal format + * that is supported by the implementation in at least some subset of + * possible operations, TRUE is written to <params>. If <internalformat> + * is not a valid token for any internal format usage, FALSE is returned. + * + * <internalformats> that must be supported (in GL 4.2 or later) include + * the following: + * - "sized internal formats" from Table 3.12, 3.13, and 3.15, + * - any specific "compressed internal format" from Table 3.14, + * - any "image unit format" from Table 3.21. + * - any generic "compressed internal format" from Table 3.14, if the + * implementation accepts it for any texture specification commands, and + * - unsized or base internal format, if the implementation accepts + * it for texture or image specification. + */ + GLint buffer[1]; + + /* At this point an internalformat is valid if it is valid as a texture or + * as a renderbuffer format. The checks are different because those methods + * return different values when passed unsupported internalformats */ + if (_mesa_base_tex_format(ctx, internalformat) < 0 && + _mesa_base_fbo_format(ctx, internalformat) == 0) + return false; + + /* Let the driver have the final word */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, + GL_INTERNALFORMAT_SUPPORTED, buffer); + + return (buffer[0] == GL_TRUE); +} + +static bool +_legal_target_for_framebuffer_texture_layer(struct gl_context *ctx, + GLenum target) +{ + switch (target) { + case GL_TEXTURE_3D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + case GL_TEXTURE_CUBE_MAP: + return true; + default: + return false; + } +} + +static GLenum +_mesa_generic_type_for_internal_format(GLenum internalFormat) +{ + if (_mesa_is_enum_format_unsigned_int(internalFormat)) + return GL_UNSIGNED_BYTE; + else if (_mesa_is_enum_format_signed_int(internalFormat)) + return GL_BYTE; + else + return GL_FLOAT; +} + +/* Default implementation of the QueryInternalFormat driverfunc, for + * drivers not implementing ARB_internalformat_query2.
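+ *
+ * A driver opts into this fallback simply by pointing its function table
+ * at it; a minimal sketch (hypothetical driver initialization code):
+ *
+ *    functions->QueryInternalFormat = _mesa_query_internal_format_default;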
+ */ +void +_mesa_query_internal_format_default(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params) +{ + (void) target; + + switch (pname) { + case GL_SAMPLES: + case GL_NUM_SAMPLE_COUNTS: + params[0] = 1; + break; + + case GL_INTERNALFORMAT_SUPPORTED: + params[0] = GL_TRUE; + break; + + case GL_INTERNALFORMAT_PREFERRED: + params[0] = internalFormat; + break; + + case GL_READ_PIXELS_FORMAT: { + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + switch (base_format) { + case GL_STENCIL_INDEX: + case GL_DEPTH_COMPONENT: + case GL_DEPTH_STENCIL: + case GL_RED: + case GL_RGB: + case GL_BGR: + case GL_RGBA: + case GL_BGRA: + params[0] = base_format; + break; + default: + params[0] = GL_NONE; + break; + } + break; + } + + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_TYPE: { + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format > 0) + params[0] = _mesa_generic_type_for_internal_format(internalFormat); + else + params[0] = GL_NONE; + break; + } + + case GL_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_FORMAT: { + GLenum format = GL_NONE; + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format > 0) { + if (_mesa_is_enum_format_integer(internalFormat)) + format = _mesa_base_format_to_integer_format(base_format); + else + format = base_format; + } + + params[0] = format; + break; + } + + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_SRGB_DECODE_ARB: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_FILTER: + params[0] = GL_FULL_SUPPORT; + break; + + default: + _set_default_response(pname, params); + break; + } +} + +/* + * For MAX_WIDTH/MAX_HEIGHT/MAX_DEPTH it returns the equivalent GetInteger + * pname for a GetInternalformat pname/target combination. target/pname + * combinations that would return 0 due to the dimension count or to an + * unsupported status should already be filtered out. + * + * Note that this means that the returned value would be independent of the + * internalformat. This possibility is already mentioned in Issue 7 of the + * arb_internalformat_query2 spec.
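+ *
+ * For example:
+ *
+ *    equivalentSizePname(GL_TEXTURE_1D_ARRAY, GL_MAX_WIDTH)
+ *       == GL_MAX_TEXTURE_SIZE
+ *    equivalentSizePname(GL_TEXTURE_1D_ARRAY, GL_MAX_HEIGHT)
+ *       == GL_MAX_ARRAY_TEXTURE_LAYERS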
+ */ +static GLenum +equivalentSizePname(GLenum target, + GLenum pname) +{ + switch (target) { + case GL_TEXTURE_1D: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_MULTISAMPLE: + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_3D: + return GL_MAX_3D_TEXTURE_SIZE; + case GL_TEXTURE_CUBE_MAP: + return GL_MAX_CUBE_MAP_TEXTURE_SIZE; + case GL_TEXTURE_RECTANGLE: + return GL_MAX_RECTANGLE_TEXTURE_SIZE; + case GL_RENDERBUFFER: + return GL_MAX_RENDERBUFFER_SIZE; + case GL_TEXTURE_1D_ARRAY: + if (pname == GL_MAX_HEIGHT) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + if (pname == GL_MAX_DEPTH) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_CUBE_MAP_ARRAY: + if (pname == GL_MAX_DEPTH) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_CUBE_MAP_TEXTURE_SIZE; + case GL_TEXTURE_BUFFER: + return GL_MAX_TEXTURE_BUFFER_SIZE; + default: + return 0; + } +} + +/* + * Returns the number of dimensions associated with a target. GL_TEXTURE_BUFFER and + * GL_RENDERBUFFER have an associated dimension, but they are not textures + * per se, so we can't just call _mesa_get_texture_dimensions directly. + */ +static GLint +get_target_dimensions(GLenum target) +{ + switch(target) { + case GL_TEXTURE_BUFFER: + return 1; + case GL_RENDERBUFFER: + return 2; + default: + return _mesa_get_texture_dimensions(target); + } +} + +/* + * Returns the minimum number of dimensions associated with a pname. For + * example, if querying GL_MAX_HEIGHT, it is assumed that the target has at + * least 2 dimensions. + * + * Useful to handle sentences like this from the query2 spec: + * + * "MAX_HEIGHT: + * <skip> + * If the resource does not have at least two dimensions + * <skip>." + */ +static GLint +get_min_dimensions(GLenum pname) +{ + switch(pname) { + case GL_MAX_WIDTH: + return 1; + case GL_MAX_HEIGHT: + return 2; + case GL_MAX_DEPTH: + return 3; + default: + return 0; + } +} + +/* + * Similar to teximage.c:check_multisample_target, but independent of the + * dimensions. + */ +static bool +is_multisample_target(GLenum target) +{ + switch(target) { + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return true; + default: + return false; + } +} + +void GLAPIENTRY +_mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, + GLsizei bufSize, GLint *params) +{ + GLint buffer[16]; + GET_CURRENT_CONTEXT(ctx); + + ASSERT_OUTSIDE_BEGIN_END(ctx); + + /* ARB_internalformat_query is also mandatory for ARB_internalformat_query2 */ + if (!(_mesa_has_ARB_internalformat_query(ctx) || + _mesa_is_gles3(ctx))) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformativ"); return; } + assert(ctx->Driver.QueryInternalFormat != NULL); + + if (!_legal_parameters(ctx, target, internalformat, pname, bufSize, params)) + return; + + /* initialize the contents of the temporary buffer */ + memcpy(buffer, params, MIN2(bufSize, 16) * sizeof(GLint)); + + /* Use the 'unsupported' response defined by the spec for every pname + * as the default answer.
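+ * That way every early exit below ("goto end") still copies back a
+ * well-defined value, e.g. 0 for GL_NUM_SAMPLE_COUNTS, GL_NONE for
+ * GL_COLOR_ENCODING or GL_FALSE for GL_MIPMAP (see _set_default_response()
+ * above).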
+ */ + _set_default_response(pname, buffer); + + if (!_is_target_supported(ctx, target) || + !_is_internalformat_supported(ctx, target, internalformat) || + !_is_resource_supported(ctx, target, internalformat, pname)) + goto end; + switch (pname) { case GL_SAMPLES: - count = ctx->Driver.QuerySamplesForFormat(ctx, target, - internalformat, buffer); - break; - case GL_NUM_SAMPLE_COUNTS: { - if ((ctx->API == API_OPENGLES2 && ctx->Version == 30) && - _mesa_is_enum_format_integer(internalformat)) { - /* From GL ES 3.0 specification, section 6.1.15 page 236: "Since - * multisampling is not supported for signed and unsigned integer - * internal formats, the value of NUM_SAMPLE_COUNTS will be zero - * for such formats. - * - * Such a restriction no longer exists in GL ES 3.1. - */ - buffer[0] = 0; - count = 1; + /* fall-through */ + case GL_NUM_SAMPLE_COUNTS: + /* The ARB_internalformat_query2 spec sets the response as 'unsupported' for + * SAMPLES and NUM_SAMPLE_COUNTS: + * + * "If <internalformat> is not color-renderable, depth-renderable, or + * stencil-renderable (as defined in section 4.4.4), or if <target> + * does not support multiple samples (ie other than + * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY, + * or RENDERBUFFER)." + */ + if ((target != GL_RENDERBUFFER && + target != GL_TEXTURE_2D_MULTISAMPLE && + target != GL_TEXTURE_2D_MULTISAMPLE_ARRAY) || + !_is_renderable(ctx, internalformat)) + goto end; + + /* The GL ES 3.0 specification, section 6.1.15 page 236 says: + * + * "Since multisampling is not supported for signed and unsigned + * integer internal formats, the value of NUM_SAMPLE_COUNTS will be + * zero for such formats." + */ + if (pname == GL_NUM_SAMPLE_COUNTS && ctx->API == API_OPENGLES2 && + ctx->Version == 30 && _mesa_is_enum_format_integer(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_INTERNALFORMAT_SUPPORTED: + /* Having a supported <internalformat> is implemented as a prerequisite + * for all the <pnames>. Thus, if we reach this point, the internalformat is + * supported. + */ + buffer[0] = GL_TRUE; + break; + + case GL_INTERNALFORMAT_PREFERRED: + /* The ARB_internalformat_query2 spec says: + * + * "- INTERNALFORMAT_PREFERRED: The implementation-preferred internal + * format for representing resources of the specified <internalformat> is + * returned in <params>." + * + * Therefore, we let the driver answer. + */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: { + GLint baseformat; + mesa_format texformat; + + if (target != GL_RENDERBUFFER) { + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true)) + goto end; + + baseformat = _mesa_base_tex_format(ctx, internalformat); } else { - size_t num_samples; + baseformat = _mesa_base_fbo_format(ctx, internalformat); + } - /* The driver can return 0, and we should pass that along to the - * application.
The ARB decided that ARB_internalformat_query should - behave as ARB_internalformat_query2 in this situation. - * - * The ARB_internalformat_query2 spec says: - * - * "- NUM_SAMPLE_COUNTS: The number of sample counts that would be - * returned by querying SAMPLES is returned in <params>. - * * If <internalformat> is not color-renderable, - * depth-renderable, or stencil-renderable (as defined in - * section 4.4.4), or if <target> does not support multiple - * samples (ie other than TEXTURE_2D_MULTISAMPLE, - * TEXTURE_2D_MULTISAMPLE_ARRAY, or RENDERBUFFER), 0 is - * returned." - */ - num_samples = ctx->Driver.QuerySamplesForFormat(ctx, target, internalformat, buffer); + /* Let the driver choose the texture format. + * + * Disclaimer: we assume that drivers use the same format-choice logic + * for renderbuffers as for textures. + */ + texformat = ctx->Driver.ChooseTextureFormat(ctx, target, internalformat, + GL_NONE /*format */, GL_NONE /* type */); + + if (texformat == MESA_FORMAT_NONE || baseformat <= 0) + goto end; + + /* Implementation based on what Mesa does for glGetTexLevelParameteriv + * and glGetRenderbufferParameteriv functions. + */ + if (pname == GL_INTERNALFORMAT_SHARED_SIZE) { + if (_mesa_has_EXT_texture_shared_exponent(ctx) && + target != GL_TEXTURE_BUFFER && + target != GL_RENDERBUFFER && + texformat == MESA_FORMAT_R9G9B9E5_FLOAT) { + buffer[0] = 5; + } + goto end; + } + + if (!_mesa_base_format_has_channel(baseformat, pname)) + goto end; - /* QuerySamplesForFormat writes some stuff to buffer, so we have to - * separately over-write it with the requested value. + switch (pname) { + case GL_INTERNALFORMAT_DEPTH_SIZE: + if (!_mesa_has_ARB_depth_texture(ctx) && + target != GL_RENDERBUFFER && + target != GL_TEXTURE_BUFFER) + goto end; + /* fallthrough */ + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + buffer[0] = _mesa_get_format_bits(texformat, pname); + break; + + case GL_INTERNALFORMAT_DEPTH_TYPE: + if (!_mesa_has_ARB_texture_float(ctx)) + goto end; + /* fallthrough */ + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + buffer[0] = _mesa_get_format_datatype(texformat); + break; + + default: + break; + + } + break; + } + + /* For WIDTH/HEIGHT/DEPTH/LAYERS there is no reason to think that the + * returned values should be different from the values returned by + * GetInteger with MAX_TEXTURE_SIZE, MAX_3D_TEXTURE_SIZE, etc. */ + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: { + GLenum get_pname; + GLint dimensions; + GLint min_dimensions; + + /* From the query2 MAX_HEIGHT spec (as an example): + * + * "If the resource does not have at least two dimensions, or if the + * resource is unsupported, zero is returned."
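+ *
+ * E.g. a GL_MAX_DEPTH query on GL_TEXTURE_2D returns 0, as a 2D texture
+ * has fewer than three dimensions, while the same query on GL_TEXTURE_3D
+ * is forwarded to glGetIntegerv(GL_MAX_3D_TEXTURE_SIZE).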
+ */ + dimensions = get_target_dimensions(target); + min_dimensions = get_min_dimensions(pname); + if (dimensions < min_dimensions) + goto end; + + get_pname = equivalentSizePname(target, pname); + if (get_pname == 0) + goto end; + + _mesa_GetIntegerv(get_pname, buffer); + break; + } + + case GL_MAX_LAYERS: + if (!_mesa_has_EXT_texture_array(ctx)) + goto end; + + if (!_mesa_is_array_texture(target)) + goto end; + + _mesa_GetIntegerv(GL_MAX_ARRAY_TEXTURE_LAYERS, buffer); + break; + + case GL_MAX_COMBINED_DIMENSIONS:{ + GLint64 combined_value = 1; + GLenum max_dimensions_pnames[] = { + GL_MAX_WIDTH, + GL_MAX_HEIGHT, + GL_MAX_DEPTH, + GL_SAMPLES + }; + unsigned i; + GLint current_value; + + /* Combining the dimensions. Note that for array targets, this would + * automatically include the value of MAX_LAYERS, as that value is + * returned as MAX_HEIGHT or MAX_DEPTH */ + for (i = 0; i < 4; i++) { + if (max_dimensions_pnames[i] == GL_SAMPLES && + !is_multisample_target(target)) + continue; + + _mesa_GetInternalformativ(target, internalformat, + max_dimensions_pnames[i], + 1, &current_value); + + if (current_value != 0) + combined_value *= current_value; + } + + if (_mesa_is_cube_map_texture(target)) + combined_value *= 6; + + /* We pack the 64-bit value into two 32-bit values. For the 32-bit + * query this works as long as the value fits in a 32-bit signed + * integer. For the 64-bit query, the wrapper around the 32-bit + * query will unpack the value */ + memcpy(buffer, &combined_value, sizeof(GLint64)); + break; + } + + case GL_COLOR_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- COLOR_COMPONENTS: If the internal format contains any color + * components (R, G, B, or A), TRUE is returned in <params>. + * If the internal format is unsupported or contains no color + * components, FALSE is returned." + */ + if (_mesa_is_color_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_DEPTH_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- DEPTH_COMPONENTS: If the internal format contains a depth + * component (D), TRUE is returned in <params>. If the internal format + * is unsupported or contains no depth component, FALSE is returned." + */ + if (_mesa_is_depth_format(internalformat) || + _mesa_is_depthstencil_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_STENCIL_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- STENCIL_COMPONENTS: If the internal format contains a stencil + * component (S), TRUE is returned in <params>. If the internal format + * is unsupported or contains no stencil component, FALSE is returned."
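+ *
+ * E.g. GL_DEPTH24_STENCIL8 and GL_STENCIL_INDEX8 both report GL_TRUE
+ * here, while GL_RGBA8 keeps the GL_FALSE default.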
+ */ + if (_mesa_is_stencil_format(internalformat) || + _mesa_is_depthstencil_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + if (!_is_renderable(ctx, internalformat)) + goto end; + + if (pname == GL_COLOR_RENDERABLE) { + if (!_mesa_is_color_format(internalformat)) + goto end; + } else { + GLenum baseFormat = _mesa_base_fbo_format(ctx, internalformat); + if (baseFormat != GL_DEPTH_STENCIL && + ((pname == GL_DEPTH_RENDERABLE && baseFormat != GL_DEPTH_COMPONENT) || + (pname == GL_STENCIL_RENDERABLE && baseFormat != GL_STENCIL_INDEX))) + goto end; + } + + buffer[0] = GL_TRUE; + break; + + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + if (!_mesa_has_EXT_texture_array(ctx) || + !_legal_target_for_framebuffer_texture_layer(ctx, target)) + goto end; + /* fallthrough */ + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_BLEND: + if (!_mesa_has_ARB_framebuffer_object(ctx)) + goto end; + + if (target == GL_TEXTURE_BUFFER || + !_is_renderable(ctx, internalformat)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_TYPE: + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_MIPMAP: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, target) || + !_mesa_is_valid_generate_texture_mipmap_internalformat(ctx, + internalformat)) { + goto end; + } + + if (pname == GL_MIPMAP) { + buffer[0] = GL_TRUE; + goto end; + } + else if (pname == GL_MANUAL_GENERATE_MIPMAP) { + if (!_mesa_has_ARB_framebuffer_object(ctx)) + goto end; + } + else { + /* From ARB_internalformat_query2: + * "Dependencies on OpenGL 3.2 (Core Profile) + * In core profiles for OpenGL 3.2 and later versions, queries + * for the AUTO_GENERATE_MIPMAP <pname> return the appropriate + * unsupported response." */ - buffer[0] = (GLint) num_samples; - count = 1; + if (_mesa_is_desktop_gl(ctx) && ctx->Version >= 32) + goto end; } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_COLOR_ENCODING: + if (!_mesa_is_color_format(internalformat)) + goto end; + + if (_mesa_is_srgb_format(internalformat)) + buffer[0] = GL_SRGB; + else + buffer[0] = GL_LINEAR; + break; + + case GL_SRGB_READ: + if (!_mesa_has_EXT_texture_sRGB(ctx) || + !_mesa_is_srgb_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SRGB_WRITE: + if (!_mesa_has_EXT_framebuffer_sRGB(ctx) || + !_mesa_is_color_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SRGB_DECODE_ARB: + /* Presence of EXT_texture_sRGB_decode was already verified */ + if (!_mesa_has_EXT_texture_sRGB(ctx) || + target == GL_RENDERBUFFER || + !_mesa_is_srgb_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_FILTER: + /* If the target doesn't allow setting sampler parameters, then it + * doesn't allow setting a filter other than GL_NEAREST.
In practice, this check + * only filters out MULTISAMPLE/MULTISAMPLE_ARRAY */ + if (!_mesa_target_allows_setting_sampler_parameters(target)) + goto end; + + if (_mesa_is_enum_format_integer(internalformat)) + goto end; + + if (target == GL_TEXTURE_BUFFER) + goto end; + + /* At this point we know that multi-texel filtering is supported. We + * need to call the driver to know whether it is CAVEAT_SUPPORT or + * FULL_SUPPORT. + */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + if (target == GL_RENDERBUFFER) + goto end; + + if ((pname == GL_TESS_CONTROL_TEXTURE || + pname == GL_TESS_EVALUATION_TEXTURE) && + !_mesa_has_tessellation(ctx)) + goto end; + + if (pname == GL_GEOMETRY_TEXTURE && !_mesa_has_geometry_shaders(ctx)) + goto end; + + if (pname == GL_COMPUTE_TEXTURE && !_mesa_has_compute_shaders(ctx)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + if (!_mesa_has_ARB_texture_gather(ctx)) + goto end; + + /* fallthrough */ + case GL_TEXTURE_SHADOW: + /* Only depth or depth-stencil image formats make sense in shadow + * samplers */ + if (pname != GL_TEXTURE_GATHER && + !_mesa_is_depth_format(internalformat) && + !_mesa_is_depthstencil_format(internalformat)) + goto end; + + /* Validate the target for shadow and gather operations */ + switch (target) { + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + break; + + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + /* 1D and 1DArray textures are not allowed in gather operations */ + if (pname != GL_TEXTURE_SHADOW) + goto end; + break; + + default: + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + /* We call _mesa_is_shader_image_format_supported + * with "internalformat" as the parameter, because + * the ARB_internalformat_query2 spec says: + * "In this case the <internalformat> is the value of the <format> + * parameter that is passed to BindImageTexture."
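+ *
+ * A hypothetical caller (not part of this patch) would check image-store
+ * support for a format with:
+ *
+ *    GLint support;
+ *    glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA16F,
+ *                          GL_SHADER_IMAGE_STORE, 1, &support);
+ *
+ * and get back GL_NONE, GL_CAVEAT_SUPPORT or GL_FULL_SUPPORT.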
+ */ + if (target == GL_RENDERBUFFER || + !_mesa_is_shader_image_format_supported(ctx, internalformat)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SHADER_IMAGE_ATOMIC: + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_IMAGE_TEXEL_SIZE: { + mesa_format image_format; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + image_format = _mesa_get_shader_image_format(internalformat); + if (image_format == MESA_FORMAT_NONE) + goto end; + + /* We return bits */ + buffer[0] = (_mesa_get_format_bytes(image_format) * 8); break; } + + case GL_IMAGE_COMPATIBILITY_CLASS: + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + buffer[0] = _mesa_get_image_format_class(internalformat); + break; + + case GL_IMAGE_PIXEL_FORMAT: { + GLint base_format; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER || + !_mesa_is_shader_image_format_supported(ctx, internalformat)) + goto end; + + base_format = _mesa_base_tex_format(ctx, internalformat); + if (base_format == -1) + goto end; + + if (_mesa_is_enum_format_integer(internalformat)) + buffer[0] = _mesa_base_format_to_integer_format(base_format); + else + buffer[0] = base_format; + break; + } + + case GL_IMAGE_PIXEL_TYPE: { + mesa_format image_format; + GLenum datatype; + GLuint comps; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + image_format = _mesa_get_shader_image_format(internalformat); + if (image_format == MESA_FORMAT_NONE) + goto end; + + _mesa_uncompressed_format_to_type_and_comps(image_format, &datatype, + &comps); + if (!datatype) + goto end; + + buffer[0] = datatype; + break; + } + + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: { + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true)) + goto end; + + /* From spec: "Equivalent to calling GetTexParameter with <value> set + * to IMAGE_FORMAT_COMPATIBILITY_TYPE." + * + * GetTexParameter just returns + * tex_obj->ImageFormatCompatibilityType. We create a fake tex_obj + * just with the purpose of getting the value. 
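+ * (A texture name of 0 is fine here: the object is never bound, and it is
+ * deleted again right after the default value has been read.)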
+ */ + struct gl_texture_object *tex_obj = _mesa_new_texture_object(ctx, 0, target); + buffer[0] = tex_obj->ImageFormatCompatibilityType; + _mesa_delete_texture_object(ctx, tex_obj); + + break; + } + + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + if (target == GL_RENDERBUFFER) + goto end; + + if (!_mesa_is_depthstencil_format(internalformat)) { + if (((pname == GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST || + pname == GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE) && + !_mesa_is_depth_format(internalformat)) || + ((pname == GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST || + pname == GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE) && + !_mesa_is_stencil_format(internalformat))) + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_COMPRESSED: + buffer[0] = _mesa_is_compressed_format(ctx, internalformat); + break; + + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: { + mesa_format mesaformat; + GLint block_size; + + mesaformat = _mesa_glenum_to_compressed_format(internalformat); + if (mesaformat == MESA_FORMAT_NONE) + goto end; + + block_size = _mesa_get_format_bytes(mesaformat); + assert(block_size > 0); + + if (pname == GL_TEXTURE_COMPRESSED_BLOCK_SIZE) { + buffer[0] = block_size; + } else { + GLuint bwidth, bheight; + + /* _mesa_get_format_block_size() returns the block width and height + * in pixels; these queries are expressed in bytes */ + _mesa_get_format_block_size(mesaformat, &bwidth, &bheight); + assert(bwidth > 0 && bheight > 0); + + if (pname == GL_TEXTURE_COMPRESSED_BLOCK_WIDTH) + buffer[0] = block_size / bheight; + else + buffer[0] = block_size / bwidth; + } + break; + } + + case GL_CLEAR_BUFFER: + if (target != GL_TEXTURE_BUFFER) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + if (!_mesa_has_ARB_texture_view(ctx) || + target == GL_TEXTURE_BUFFER || + target == GL_RENDERBUFFER) + goto end; + + if (pname == GL_TEXTURE_VIEW) { + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + } else { + GLenum view_class = _mesa_texture_view_lookup_view_class(ctx, + internalformat); + if (view_class == GL_FALSE) + goto end; + + buffer[0] = view_class; + } + break; + default: - _mesa_error(ctx, GL_INVALID_ENUM, - "glGetInternalformativ(pname=%s)", - _mesa_enum_to_string(pname)); - return; + unreachable("bad param"); } + end: if (bufSize != 0 && params == NULL) { /* Emit a warning to aid application debugging, but go ahead and do the * memcpy (and probably crash) anyway. @@ -190,7 +1530,55 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, * application. Clamp the size of the copy to the size supplied by the * application.
*/ - memcpy(params, buffer, MIN2(count, bufSize) * sizeof(GLint)); + memcpy(params, buffer, MIN2(bufSize, 16) * sizeof(GLint)); return; } + +void GLAPIENTRY +_mesa_GetInternalformati64v(GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint64 *params) +{ + GLint params32[16]; + unsigned i; + GLsizei realSize = MIN2(bufSize, 16); + GLsizei callSize; + + GET_CURRENT_CONTEXT(ctx); + + ASSERT_OUTSIDE_BEGIN_END(ctx); + + if (!_mesa_has_ARB_internalformat_query2(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformati64v"); + return; + } + + /* For SAMPLES there are cases where params needs to remain unmodified. As + * no pname can return a negative value, we fill params32 with negative + * values as sentinels, so we know which entries to copy back to + * params */ + memset(params32, -1, sizeof(params32)); + + /* For GL_MAX_COMBINED_DIMENSIONS we need to get back 2 32-bit integers, + * and no more than that. So for that pname, we call the 32-bit query + * with bufSize 2, except when bufSize is 0, which is basically a request + * not to get the value; but that is the caller's problem. */ + if (pname == GL_MAX_COMBINED_DIMENSIONS && bufSize > 0) + callSize = 2; + else + callSize = bufSize; + + _mesa_GetInternalformativ(target, internalformat, pname, callSize, params32); + + if (pname == GL_MAX_COMBINED_DIMENSIONS) { + memcpy(params, params32, sizeof(GLint64)); + } else { + for (i = 0; i < realSize; i++) { + /* We only copy back the values that changed */ + if (params32[i] < 0) + break; + params[i] = (GLint64) params32[i]; + } + } +} diff --git a/src/mesa/main/formatquery.h b/src/mesa/main/formatquery.h index 603400059e5..1061fd26753 100644 --- a/src/mesa/main/formatquery.h +++ b/src/mesa/main/formatquery.h @@ -32,8 +32,17 @@ size_t _mesa_query_samples_for_format(struct gl_context *ctx, GLenum target, GLenum internalFormat, int samples[16]); +void +_mesa_query_internal_format_default(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params); + extern void GLAPIENTRY _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params); +extern void GLAPIENTRY +_mesa_GetInternalformati64v(GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint64 *params); + #endif /* FORMATQUERY_H */ diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c index 9f920075026..41d40a522fa 100644 --- a/src/mesa/main/formats.c +++ b/src/mesa/main/formats.c @@ -132,21 +132,25 @@ _mesa_get_format_bits(mesa_format format, GLenum pname) case GL_TEXTURE_RED_SIZE: case GL_RENDERBUFFER_RED_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE: + case GL_INTERNALFORMAT_RED_SIZE: return info->RedBits; case GL_GREEN_BITS: case GL_TEXTURE_GREEN_SIZE: case GL_RENDERBUFFER_GREEN_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: return info->GreenBits; case GL_BLUE_BITS: case GL_TEXTURE_BLUE_SIZE: case GL_RENDERBUFFER_BLUE_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: return info->BlueBits; case GL_ALPHA_BITS: case GL_TEXTURE_ALPHA_SIZE: case GL_RENDERBUFFER_ALPHA_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: return info->AlphaBits; case GL_TEXTURE_INTENSITY_SIZE: return info->IntensityBits; @@ -158,11 +162,13 @@ _mesa_get_format_bits(mesa_format format, GLenum pname) case GL_TEXTURE_DEPTH_SIZE_ARB: case GL_RENDERBUFFER_DEPTH_SIZE_EXT: case
GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: return info->DepthBits; case GL_STENCIL_BITS: case GL_TEXTURE_STENCIL_SIZE_EXT: case GL_RENDERBUFFER_STENCIL_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: return info->StencilBits; default: _mesa_problem(NULL, "bad pname in _mesa_get_format_bits()"); diff --git a/src/mesa/main/formats.csv b/src/mesa/main/formats.csv index 529de31212c..a663c1e867f 100644 --- a/src/mesa/main/formats.csv +++ b/src/mesa/main/formats.csv @@ -26,7 +26,7 @@ # ########################################################################### -# This CSV file has the input data for gen_format.h and gen_format.c +# This CSV file has the input data for format_parser.py # # Each format entry contains: # - name, per enum mesa_format diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c index 6c2d31dbcf3..6eacd424df7 100644 --- a/src/mesa/main/genmipmap.c +++ b/src/mesa/main/genmipmap.c @@ -38,27 +38,18 @@ #include "texobj.h" #include "hash.h" -/** - * Implements glGenerateMipmap and glGenerateTextureMipmap. - * Generates all the mipmap levels below the base level. - */ -void -_mesa_generate_texture_mipmap(struct gl_context *ctx, - struct gl_texture_object *texObj, GLenum target, - bool dsa) +bool +_mesa_is_valid_generate_texture_mipmap_target(struct gl_context *ctx, + GLenum target) { - struct gl_texture_image *srcImage; - GLboolean error; - const char *suffix = dsa ? "Texture" : ""; - - FLUSH_VERTICES(ctx, 0); + bool error; switch (target) { case GL_TEXTURE_1D: error = _mesa_is_gles(ctx); break; case GL_TEXTURE_2D: - error = GL_FALSE; + error = false; break; case GL_TEXTURE_3D: error = ctx->API == API_OPENGLES; @@ -78,14 +69,35 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx, !ctx->Extensions.ARB_texture_cube_map_array; break; default: - error = GL_TRUE; + error = true; } - if (error) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGenerate%sMipmap(target=%s)", - suffix, _mesa_enum_to_string(target)); - return; - } + return !error; +} + +bool +_mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx, + GLenum internalformat) +{ + return (!_mesa_is_enum_format_integer(internalformat) && + !_mesa_is_depthstencil_format(internalformat) && + !_mesa_is_astc_format(internalformat) && + !_mesa_is_stencil_format(internalformat)); +} + +/** + * Implements glGenerateMipmap and glGenerateTextureMipmap. + * Generates all the mipmap levels below the base level. + */ +void +_mesa_generate_texture_mipmap(struct gl_context *ctx, + struct gl_texture_object *texObj, GLenum target, + bool dsa) +{ + struct gl_texture_image *srcImage; + const char *suffix = dsa ? 
"Texture" : ""; + + FLUSH_VERTICES(ctx, 0); if (texObj->BaseLevel >= texObj->MaxLevel) { /* nothing to do */ @@ -109,10 +121,8 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx, return; } - if (_mesa_is_enum_format_integer(srcImage->InternalFormat) || - _mesa_is_depthstencil_format(srcImage->InternalFormat) || - _mesa_is_astc_format(srcImage->InternalFormat) || - _mesa_is_stencil_format(srcImage->InternalFormat)) { + if (!_mesa_is_valid_generate_texture_mipmap_internalformat(ctx, + srcImage->InternalFormat)) { _mesa_unlock_texture(ctx, texObj); _mesa_error(ctx, GL_INVALID_OPERATION, "glGenerate%sMipmap(invalid internal format)", suffix); @@ -143,6 +153,12 @@ _mesa_GenerateMipmap(GLenum target) struct gl_texture_object *texObj; GET_CURRENT_CONTEXT(ctx); + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, target)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateMipmap(target=%s)", + _mesa_enum_to_string(target)); + return; + } + texObj = _mesa_get_current_tex_object(ctx, target); if (!texObj) return; @@ -163,5 +179,11 @@ _mesa_GenerateTextureMipmap(GLuint texture) if (!texObj) return; + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, texObj->Target)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateTextureMipmap(target=%s)", + _mesa_enum_to_string(texObj->Target)); + return; + } + _mesa_generate_texture_mipmap(ctx, texObj, texObj->Target, true); } diff --git a/src/mesa/main/genmipmap.h b/src/mesa/main/genmipmap.h index f4ef859511e..40b7f3636aa 100644 --- a/src/mesa/main/genmipmap.h +++ b/src/mesa/main/genmipmap.h @@ -32,6 +32,12 @@ extern void _mesa_generate_texture_mipmap(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum target, bool dsa); +bool +_mesa_is_valid_generate_texture_mipmap_target(struct gl_context *ctx, + GLenum target); +bool +_mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx, + GLenum internalformat); extern void GLAPIENTRY _mesa_GenerateMipmap(GLenum target); diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index f40c5705813..b0fadc93aef 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -384,6 +384,13 @@ static const int extra_ARB_shader_storage_buffer_object_and_geometry_shader[] = EXTRA_END }; +static const int extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31[] = { + EXT(ARB_shader_image_load_store), + EXT(ARB_shader_storage_buffer_object), + EXTRA_API_ES31, + EXTRA_END +}; + static const int extra_ARB_framebuffer_no_attachments_and_geometry_shader[] = { EXTRA_EXT_FB_NO_ATTACH_GS, EXTRA_END @@ -1055,6 +1062,8 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu } break; /* GL_KHR_DEBUG */ + case GL_DEBUG_OUTPUT: + case GL_DEBUG_OUTPUT_SYNCHRONOUS: case GL_DEBUG_LOGGED_MESSAGES: case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH: case GL_DEBUG_GROUP_STACK_DEPTH: @@ -1715,19 +1724,19 @@ _mesa_GetInteger64v(GLenum pname, GLint64 *params) break; case TYPE_FLOATN_4: - params[3] = FLOAT_TO_INT64(((GLfloat *) p)[3]); + params[3] = FLOAT_TO_INT(((GLfloat *) p)[3]); case TYPE_FLOATN_3: - params[2] = FLOAT_TO_INT64(((GLfloat *) p)[2]); + params[2] = FLOAT_TO_INT(((GLfloat *) p)[2]); case TYPE_FLOATN_2: - params[1] = FLOAT_TO_INT64(((GLfloat *) p)[1]); + params[1] = FLOAT_TO_INT(((GLfloat *) p)[1]); case TYPE_FLOATN: - params[0] = FLOAT_TO_INT64(((GLfloat *) p)[0]); + params[0] = FLOAT_TO_INT(((GLfloat *) p)[0]); break; case TYPE_DOUBLEN_2: - params[1] = FLOAT_TO_INT64(((GLdouble *) p)[1]); + params[1] = FLOAT_TO_INT(((GLdouble *) p)[1]); case TYPE_DOUBLEN: - 
params[0] = FLOAT_TO_INT64(((GLdouble *) p)[0]); + params[0] = FLOAT_TO_INT(((GLdouble *) p)[0]); break; case TYPE_INT_4: diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index 07d2d20df7a..12c21899cb1 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -126,6 +126,8 @@ descriptor=[ [ "MAX_TEXTURE_MAX_ANISOTROPY_EXT", "CONTEXT_FLOAT(Const.MaxTextureMaxAnisotropy), extra_EXT_texture_filter_anisotropic" ], # GL_KHR_debug (GL 4.3)/ GL_ARB_debug_output + [ "DEBUG_OUTPUT", "LOC_CUSTOM, TYPE_BOOLEAN, 0, NO_EXTRA" ], + [ "DEBUG_OUTPUT_SYNCHRONOUS", "LOC_CUSTOM, TYPE_BOOLEAN, 0, NO_EXTRA" ], [ "DEBUG_LOGGED_MESSAGES", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], [ "DEBUG_NEXT_LOGGED_MESSAGE_LENGTH", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], [ "MAX_DEBUG_LOGGED_MESSAGES", "CONST(MAX_DEBUG_LOGGED_MESSAGES), NO_EXTRA" ], @@ -493,9 +495,12 @@ descriptor=[ [ "MAX_COMBINED_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.MaxCombinedShaderStorageBlocks), extra_ARB_shader_storage_buffer_object_es31" ], [ "MAX_SHADER_STORAGE_BLOCK_SIZE", "CONTEXT_INT(Const.MaxShaderStorageBlockSize), extra_ARB_shader_storage_buffer_object_es31" ], [ "MAX_SHADER_STORAGE_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxShaderStorageBufferBindings), extra_ARB_shader_storage_buffer_object_es31" ], - [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_storage_buffer_object_es31" ], [ "SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.ShaderStorageBufferOffsetAlignment), extra_ARB_shader_storage_buffer_object_es31" ], [ "SHADER_STORAGE_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_storage_buffer_object_es31" ], + + # GL_ARB_shader_image_load_store / GL_ARB_shader_storage_buffer_object / GLES 3.1 + # (MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS in GL_ARB_shader_image_load_store) + [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ], ]}, # Enums in OpenGL Core profile and ES 3.1 @@ -773,6 +778,7 @@ descriptor=[ [ "DEPTH_CLAMP", "CONTEXT_BOOL(Transform.DepthClamp), extra_ARB_depth_clamp" ], # GL_ATI_fragment_shader + [ "FRAGMENT_SHADER_ATI", "CONTEXT_BOOL(ATIFragmentShader.Enabled), extra_ATI_fragment_shader" ], [ "NUM_FRAGMENT_REGISTERS_ATI", "CONST(6), extra_ATI_fragment_shader" ], [ "NUM_FRAGMENT_CONSTANTS_ATI", "CONST(8), extra_ATI_fragment_shader" ], [ "NUM_PASSES_ATI", "CONST(2), extra_ATI_fragment_shader" ], @@ -838,7 +844,6 @@ descriptor=[ [ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"], # GL_ARB_shader_image_load_store - [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store" ], [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ], # GL_EXT_polygon_offset_clamp diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index 987cd0db45c..cf6495885b6 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -1339,6 +1339,51 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format) } /** + * Test if the given format represents an sRGB format. 
+ * \param format the GL format (can be an internal format) + * \return GL_TRUE if format is sRGB, GL_FALSE otherwise + */ +GLboolean +_mesa_is_srgb_format(GLenum format) +{ + switch (format) { + case GL_SRGB: + case GL_SRGB8: + case GL_SRGB_ALPHA: + case GL_SRGB8_ALPHA8: + case GL_COMPRESSED_SRGB: + case GL_COMPRESSED_SRGB_ALPHA: + case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT: + case GL_COMPRESSED_SRGB8_ETC2: + case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC: + case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: + case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR: + return GL_TRUE; + default: + break; + } + + return GL_FALSE; +} + +/** * Convert various unpack formats to the corresponding base format. */ GLenum @@ -1430,6 +1475,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_RED_TYPE: case GL_RENDERBUFFER_RED_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: if (base_format == GL_RED || base_format == GL_RG || base_format == GL_RGB || @@ -1441,6 +1488,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_GREEN_TYPE: case GL_RENDERBUFFER_GREEN_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_TYPE: if (base_format == GL_RG || base_format == GL_RGB || base_format == GL_RGBA) { @@ -1451,6 +1500,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_BLUE_TYPE: case GL_RENDERBUFFER_BLUE_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_TYPE: if (base_format == GL_RGB || base_format == GL_RGBA) { return GL_TRUE; @@ -1460,6 +1511,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_ALPHA_TYPE: case GL_RENDERBUFFER_ALPHA_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_TYPE: if (base_format == GL_RGBA || base_format == GL_ALPHA || base_format == GL_LUMINANCE_ALPHA) { @@ -1483,6 +1536,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_DEPTH_TYPE: case GL_RENDERBUFFER_DEPTH_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_TYPE: if (base_format == GL_DEPTH_STENCIL || base_format == GL_DEPTH_COMPONENT) { return GL_TRUE; @@ -1490,6 +1545,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) return GL_FALSE; case GL_RENDERBUFFER_STENCIL_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_TYPE: if (base_format == 
GL_DEPTH_STENCIL || base_format == GL_STENCIL_INDEX) { return GL_TRUE; @@ -2546,6 +2603,10 @@ _mesa_es3_effective_internal_format_for_format_and_type(GLenum format, return GL_RGBA8; case GL_RGB: return GL_RGB8; + case GL_RG: + return GL_RG8; + case GL_RED: + return GL_R8; /* Although LUMINANCE_ALPHA, LUMINANCE and ALPHA appear in table 3.12, * (section 3.8 Texturing, page 128 of the OpenGL-ES 3.0.4) as effective * internal formats, they do not correspond to GL constants, so the base @@ -3464,6 +3525,27 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type) case GL_UNSIGNED_INT_10F_11F_11F_REV: if (format == GL_RGB) return MESA_FORMAT_R11G11B10_FLOAT; + break; + case GL_FLOAT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_FLOAT32; + break; + case GL_UNSIGNED_INT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_UNORM32; + break; + case GL_UNSIGNED_SHORT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_UNORM16; + break; + case GL_UNSIGNED_INT_24_8: + if (format == GL_DEPTH_STENCIL) + return MESA_FORMAT_Z24_UNORM_S8_UINT; + break; + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + if (format == GL_DEPTH_STENCIL) + return MESA_FORMAT_Z32_FLOAT_S8X24_UINT; + break; default: break; } diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h index b3668556da2..00d2767085d 100644 --- a/src/mesa/main/glformats.h +++ b/src/mesa/main/glformats.h @@ -101,6 +101,9 @@ _mesa_is_depth_or_stencil_format(GLenum format); extern GLboolean _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format); +extern GLboolean +_mesa_is_srgb_format(GLenum format); + extern GLenum _mesa_base_format_to_integer_format(GLenum format); diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index a2e3c26c321..2e43996f23a 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3811,6 +3811,7 @@ struct gl_extensions GLboolean ARB_indirect_parameters; GLboolean ARB_instanced_arrays; GLboolean ARB_internalformat_query; + GLboolean ARB_internalformat_query2; GLboolean ARB_map_buffer_range; GLboolean ARB_occlusion_query; GLboolean ARB_occlusion_query2; @@ -3819,6 +3820,7 @@ struct gl_extensions GLboolean ARB_query_buffer_object; GLboolean ARB_sample_shading; GLboolean ARB_seamless_cube_map; + GLboolean ARB_shader_atomic_counter_ops; GLboolean ARB_shader_atomic_counters; GLboolean ARB_shader_bit_encoding; GLboolean ARB_shader_clock; diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c index e7783ea5374..77773a20883 100644 --- a/src/mesa/main/multisample.c +++ b/src/mesa/main/multisample.c @@ -174,10 +174,15 @@ _mesa_check_sample_count(struct gl_context *ctx, GLenum target, * for <internalformat> then the error INVALID_OPERATION is generated." */ if (ctx->Extensions.ARB_internalformat_query) { - GLint buffer[16]; - int count = ctx->Driver.QuerySamplesForFormat(ctx, target, - internalFormat, buffer); - int limit = count ? buffer[0] : -1; + GLint buffer[16] = {-1}; + GLint limit; + + ctx->Driver.QueryInternalFormat(ctx, target, internalFormat, + GL_SAMPLES, buffer); + /* since the query returns samples sorted in descending order, + * the first element is the greatest supported sample value. + */ + limit = buffer[0]; return samples > limit ? 
GL_INVALID_OPERATION : GL_NO_ERROR; } diff --git a/src/mesa/main/samplerobj.h b/src/mesa/main/samplerobj.h index abc6e019046..8e9539d8d8f 100644 --- a/src/mesa/main/samplerobj.h +++ b/src/mesa/main/samplerobj.h @@ -27,14 +27,12 @@ #ifndef SAMPLEROBJ_H #define SAMPLEROBJ_H +#include "mtypes.h" + #ifdef __cplusplus extern "C" { #endif - -#include "mtypes.h" - - struct dd_function_table; static inline struct gl_sampler_object * diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index 040e9fd6e3c..fd5934f939f 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -331,12 +331,54 @@ get_image_format_class(mesa_format format) } } -/** - * Return whether an image format should be supported based on the current API - * version of the context. - */ -static bool -is_image_format_supported(const struct gl_context *ctx, GLenum format) +static GLenum +_image_format_class_to_glenum(enum image_format_class class) +{ + switch (class) { + case IMAGE_FORMAT_CLASS_NONE: + return GL_NONE; + case IMAGE_FORMAT_CLASS_1X8: + return GL_IMAGE_CLASS_1_X_8; + case IMAGE_FORMAT_CLASS_1X16: + return GL_IMAGE_CLASS_1_X_16; + case IMAGE_FORMAT_CLASS_1X32: + return GL_IMAGE_CLASS_1_X_32; + case IMAGE_FORMAT_CLASS_2X8: + return GL_IMAGE_CLASS_2_X_8; + case IMAGE_FORMAT_CLASS_2X16: + return GL_IMAGE_CLASS_2_X_16; + case IMAGE_FORMAT_CLASS_2X32: + return GL_IMAGE_CLASS_2_X_32; + case IMAGE_FORMAT_CLASS_10_11_11: + return GL_IMAGE_CLASS_11_11_10; + case IMAGE_FORMAT_CLASS_4X8: + return GL_IMAGE_CLASS_4_X_8; + case IMAGE_FORMAT_CLASS_4X16: + return GL_IMAGE_CLASS_4_X_16; + case IMAGE_FORMAT_CLASS_4X32: + return GL_IMAGE_CLASS_4_X_32; + case IMAGE_FORMAT_CLASS_2_10_10_10: + return GL_IMAGE_CLASS_10_10_10_2; + default: + assert(!"Invalid image_format_class"); + return GL_NONE; + } +} + +GLenum +_mesa_get_image_format_class(GLenum format) +{ + mesa_format tex_format = _mesa_get_shader_image_format(format); + if (tex_format == MESA_FORMAT_NONE) + return GL_NONE; + + enum image_format_class class = get_image_format_class(tex_format); + return _image_format_class_to_glenum(class); +} + +bool +_mesa_is_shader_image_format_supported(const struct gl_context *ctx, + GLenum format) { switch (format) { /* Formats supported on both desktop and ES GL, c.f. table 8.27 of the @@ -503,7 +545,7 @@ validate_bind_image_texture(struct gl_context *ctx, GLuint unit, return GL_FALSE; } - if (!is_image_format_supported(ctx, format)) { + if (!_mesa_is_shader_image_format_supported(ctx, format)) { _mesa_error(ctx, GL_INVALID_VALUE, "glBindImageTexture(format)"); return GL_FALSE; } @@ -668,7 +710,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) tex_format = image->InternalFormat; } - if (!is_image_format_supported(ctx, tex_format)) { + if (!_mesa_is_shader_image_format_supported(ctx, tex_format)) { /* The ARB_multi_bind spec says: * * "An INVALID_OPERATION error is generated if the internal diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h index 94ee814a716..85193e104f6 100644 --- a/src/mesa/main/shaderimage.h +++ b/src/mesa/main/shaderimage.h @@ -43,6 +43,20 @@ mesa_format _mesa_get_shader_image_format(GLenum format); /** + * Get the GL image format class for a shader image format GL enum + */ +GLenum +_mesa_get_image_format_class(GLenum format); + +/** + * Return whether an image format should be supported based on the current API + * version of the context. 
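+ *
+ * E.g. GL_R11F_G11F_B10F is a desktop-only image format, so this is
+ * expected to return false for it on a GLES 3.1 context.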
+ */ +bool +_mesa_is_shader_image_format_supported(const struct gl_context *ctx, + GLenum format); + +/** + * Get a single image unit struct with the default state. + */ struct gl_image_unit diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 24e3d189091..09b97c33074 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -857,6 +857,9 @@ const struct function common_desktop_functions_possible[] = { /* GL_ARB_internalformat_query */ { "glGetInternalformativ", 30, -1 }, + /* GL_ARB_internalformat_query2 */ + { "glGetInternalformati64v", 30, -1 }, + /* GL_ARB_multi_bind */ { "glBindBuffersBase", 44, -1 }, { "glBindBuffersRange", 44, -1 }, @@ -2355,6 +2358,7 @@ const struct function gles3_functions_possible[] = { { "glGetInteger64v", 30, -1 }, { "glGetIntegeri_v", 30, -1 }, { "glGetInternalformativ", 30, -1 }, + { "glGetInternalformati64v", 30, -1 }, // glGetProgramBinary aliases glGetProgramBinaryOES in GLES 2 { "glGetQueryiv", 30, -1 }, { "glGetQueryObjectuiv", 30, -1 }, diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c index a8ac19e40d7..3ae64521513 100644 --- a/src/mesa/main/texcompress.c +++ b/src/mesa/main/texcompress.c @@ -443,7 +443,7 @@ _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats) /** - * Convert a compressed MESA_FORMAT_x to a GLenum. + * Convert GLenum to a compressed MESA_FORMAT_x. */ mesa_format _mesa_glenum_to_compressed_format(GLenum format) diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c index 8a4c6286cbe..616a92953e7 100644 --- a/src/mesa/main/teximage.c +++ b/src/mesa/main/teximage.c @@ -242,6 +242,24 @@ _mesa_is_array_texture(GLenum target) }; } +/** + * Test if a target is a cube map. + * + * \param target texture target. + * + * \return true if the target is a cube map, false otherwise. + */ +bool +_mesa_is_cube_map_texture(GLenum target) +{ + switch(target) { + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + return true; + default: + return false; + } +} /** * Return the proxy target which corresponds to the given texture target @@ -1266,7 +1284,7 @@ compressedteximage_only_format(const struct gl_context *ctx, GLenum format) /** * Return true if the format doesn't support online compression. */ -static bool +bool _mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format) { return _mesa_is_astc_format(format) || @@ -1552,19 +1570,12 @@ compressed_tex_size(GLsizei width, GLsizei height, GLsizei depth, * \param ctx GL context * \param target Texture target * \param internalFormat Internal format of the texture image - * \param dimensions Dimensionality at the caller. This is \b not used - * in the validation. It is only used when logging - * error messages. - * \param caller Base name of the calling function (e.g., - * "glTexImage" or "glTexStorage"). * * \returns true if the combination is legal, false otherwise.
*/ bool _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, - GLenum target, GLenum internalFormat, - unsigned dimensions, - const char *caller) + GLenum target, GLenum internalFormat) { if (_mesa_base_tex_format(ctx, internalFormat) == GL_DEPTH_COMPONENT || _mesa_base_tex_format(ctx, internalFormat) == GL_DEPTH_STENCIL @@ -1603,9 +1614,6 @@ _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, !((target == GL_TEXTURE_CUBE_MAP_ARRAY || target == GL_PROXY_TEXTURE_CUBE_MAP_ARRAY) && ctx->Extensions.ARB_texture_cube_map_array)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s%dD(bad target for depth texture)", - caller, dimensions); return false; } } @@ -1849,9 +1857,11 @@ texture_error_check( struct gl_context *ctx, } /* additional checks for depth textures */ - if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalFormat, - dimensions, "glTexImage")) + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalFormat)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "glTexImage%dD(bad target for texture)", dimensions); return GL_TRUE; + } /* additional checks for compressed textures */ if (_mesa_is_compressed_format(ctx, internalFormat)) { @@ -5148,8 +5158,8 @@ _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer, bufObj, offset, size, "glTextureBufferRange"); } -static GLboolean -is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat) +GLboolean +_mesa_is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat) { /* Everything that is allowed for renderbuffers, * except for a base format of GL_STENCIL_INDEX, unless supported. @@ -5229,7 +5239,7 @@ texture_image_multisample(struct gl_context *ctx, GLuint dims, return; } - if (!is_renderable_texture_format(ctx, internalformat)) { + if (!_mesa_is_renderable_texture_format(ctx, internalformat)) { /* Page 172 of OpenGL ES 3.1 spec says: * "An INVALID_ENUM error is generated if sizedinternalformat is not * color-renderable, depth-renderable, or stencil-renderable (as diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h index 17f2c908ecc..8b687062a67 100644 --- a/src/mesa/main/teximage.h +++ b/src/mesa/main/teximage.h @@ -210,9 +210,13 @@ _mesa_validate_texbuffer_format(const struct gl_context *ctx, bool _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, GLenum target, - GLenum internalFormat, - unsigned dimensions, - const char *caller); + GLenum internalFormat); + +bool +_mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format); + +GLboolean +_mesa_is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat); extern void _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims, @@ -252,6 +256,10 @@ _mesa_texture_buffer_range(struct gl_context *ctx, struct gl_buffer_object *bufObj, GLintptr offset, GLsizeiptr size, const char *caller); + +bool +_mesa_is_cube_map_texture(GLenum target); + /*@}*/ diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c index 3b769f436b7..9350ca5c035 100644 --- a/src/mesa/main/texparam.c +++ b/src/mesa/main/texparam.c @@ -252,8 +252,8 @@ incomplete(struct gl_context *ctx, struct gl_texture_object *texObj) } -static GLboolean -target_allows_setting_sampler_parameters(GLenum target) +GLboolean +_mesa_target_allows_setting_sampler_parameters(GLenum target) { switch (target) { case GL_TEXTURE_2D_MULTISAMPLE: @@ -279,7 +279,7 @@ set_tex_parameteri(struct gl_context *ctx, switch (pname) { case GL_TEXTURE_MIN_FILTER: - if 
(!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MinFilter == params[0]) @@ -307,7 +307,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_MAG_FILTER: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MagFilter == params[0]) @@ -324,7 +324,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_S: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapS == params[0]) @@ -337,7 +337,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_T: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapT == params[0]) @@ -350,7 +350,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_R: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapR == params[0]) @@ -438,7 +438,7 @@ set_tex_parameteri(struct gl_context *ctx, if ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_shadow) || _mesa_is_gles3(ctx)) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.CompareMode == params[0]) @@ -457,7 +457,7 @@ set_tex_parameteri(struct gl_context *ctx, if ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_shadow) || _mesa_is_gles3(ctx)) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.CompareFunc == params[0]) @@ -571,7 +571,7 @@ set_tex_parameteri(struct gl_context *ctx, if (ctx->Extensions.EXT_texture_sRGB_decode) { GLenum decode = params[0]; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (decode == GL_DECODE_EXT || decode == GL_SKIP_DECODE_EXT) { @@ -589,7 +589,7 @@ set_tex_parameteri(struct gl_context *ctx, && ctx->Extensions.AMD_seamless_cubemap_per_texture) { GLenum param = params[0]; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (param != GL_TRUE && param != GL_FALSE) { @@ -645,7 +645,7 @@ set_tex_parameterf(struct gl_context *ctx, if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MinLod == params[0]) @@ -658,7 +658,7 @@ set_tex_parameterf(struct gl_context *ctx, if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MaxLod == params[0]) @@ -677,7 +677,7 @@ set_tex_parameterf(struct gl_context *ctx, case 
GL_TEXTURE_MAX_ANISOTROPY_EXT: if (ctx->Extensions.EXT_texture_filter_anisotropic) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MaxAnisotropy == params[0]) @@ -705,7 +705,7 @@ set_tex_parameterf(struct gl_context *ctx, if (_mesa_is_gles(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.LodBias != params[0]) { @@ -720,7 +720,7 @@ set_tex_parameterf(struct gl_context *ctx, !ctx->Extensions.ARB_texture_border_clamp) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; flush(ctx); @@ -1202,9 +1202,9 @@ _mesa_TextureParameterIuiv(GLuint texture, GLenum pname, const GLuint *params) _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true); } -static GLboolean -legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, - bool dsa) +GLboolean +_mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, + bool dsa) { /* Common targets for desktop GL and GLES 3.1. */ switch (target) { @@ -1578,7 +1578,7 @@ valid_tex_level_parameteriv_target(struct gl_context *ctx, GLenum target, bool dsa) { const char *suffix = dsa ? "ture" : ""; - if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) { + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, dsa)) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetTex%sLevelParameter[if]v(target=%s)", suffix, _mesa_enum_to_string(target)); diff --git a/src/mesa/main/texparam.h b/src/mesa/main/texparam.h index 96defbec213..b2f6a847337 100644 --- a/src/mesa/main/texparam.h +++ b/src/mesa/main/texparam.h @@ -65,6 +65,13 @@ _mesa_texture_parameterIuiv(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum pname, const GLuint *params, bool dsa); +GLboolean +_mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, + bool dsa); + +GLboolean +_mesa_target_allows_setting_sampler_parameters(GLenum target); + /*@}*/ /** diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c index 9fd969fbc53..f4a076028fb 100644 --- a/src/mesa/main/texstorage.c +++ b/src/mesa/main/texstorage.c @@ -358,11 +358,11 @@ tex_storage_error_check(struct gl_context *ctx, } /* additional checks for depth textures */ - if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat, - dims, dsa ? - "glTextureStorage" : - "glTexStorage")) + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sStorage%uD(bad target for texture)", + suffix, dims); return GL_TRUE; + } return GL_FALSE; } diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 316d8280338..419fbebf2f0 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -162,12 +162,8 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[ {GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, }; -/** - * Lookup format view class based on internalformat - * \return VIEW_CLASS if internalformat found in table, false otherwise. 
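Renaming target_allows_setting_sampler_parameters() into the _mesa_ namespace lets other modules share the rule that certain targets carry no sampler state. A sketch of the intended external use (the caller and error string are hypothetical):

    /* GL_TEXTURE_2D_MULTISAMPLE and similar targets have no sampler
     * state, so pnames such as GL_TEXTURE_MIN_FILTER must raise
     * GL_INVALID_ENUM for them. */
    if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glExampleParameteri(target)");
       return;
    }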
- */ -static GLenum -lookup_view_class(const struct gl_context *ctx, GLenum internalformat) +GLenum +_mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat) { GLuint i; @@ -336,8 +332,8 @@ _mesa_texture_view_compatible_format(const struct gl_context *ctx, if (origInternalFormat == newInternalFormat) return true; - origViewClass = lookup_view_class(ctx, origInternalFormat); - newViewClass = lookup_view_class(ctx, newInternalFormat); + origViewClass = _mesa_texture_view_lookup_view_class(ctx, origInternalFormat); + newViewClass = _mesa_texture_view_lookup_view_class(ctx, newInternalFormat); if ((origViewClass == newViewClass) && origViewClass != false) return true; diff --git a/src/mesa/main/textureview.h b/src/mesa/main/textureview.h index 59e24b68dd0..39b415d8793 100644 --- a/src/mesa/main/textureview.h +++ b/src/mesa/main/textureview.h @@ -34,6 +34,14 @@ _mesa_texture_view_compatible_format(const struct gl_context *ctx, GLenum origInternalFormat, GLenum newInternalFormat); +/** + * Lookup format view class based on internalformat + * \return VIEW_CLASS if internalformat found in table, false otherwise. + */ +GLenum +_mesa_texture_view_lookup_view_class(const struct gl_context *ctx, + GLenum internalformat); + extern void GLAPIENTRY _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, GLenum internalformat, diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c index 976b2686a60..f73a89f6c0f 100644 --- a/src/mesa/main/transformfeedback.c +++ b/src/mesa/main/transformfeedback.c @@ -1136,6 +1136,11 @@ _mesa_DeleteTransformFeedbacks(GLsizei n, const GLuint *names) } _mesa_HashRemove(ctx->TransformFeedback.Objects, names[i]); /* unref, but object may not be deleted until later */ + if (obj == ctx->TransformFeedback.CurrentObject) { + reference_transform_feedback_object( + &ctx->TransformFeedback.CurrentObject, + ctx->TransformFeedback.DefaultObject); + } reference_transform_feedback_object(&obj, NULL); } } diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c index 8f167be60cd..aadf82116e5 100644 --- a/src/mesa/program/prog_execute.c +++ b/src/mesa/program/prog_execute.c @@ -650,11 +650,9 @@ _mesa_execute_program(struct gl_context * ctx, program->Instructions[inst->BranchTarget].Opcode == OPCODE_ENDIF); /* eval condition */ - if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) { - GLfloat a[4]; - fetch_vector1(&inst->SrcReg[0], machine, a); - cond = (a[0] != 0.0F); - } + GLfloat a[4]; + fetch_vector1(&inst->SrcReg[0], machine, a); + cond = (a[0] != 0.0F); if (DEBUG_PROG) { printf("IF: %d\n", cond); } diff --git a/src/mesa/program/prog_optimize.c b/src/mesa/program/prog_optimize.c index c6d22644419..401f26ec0d0 100644 --- a/src/mesa/program/prog_optimize.c +++ b/src/mesa/program/prog_optimize.c @@ -289,8 +289,7 @@ _mesa_remove_dead_code_global(struct gl_program *prog) /* check dst reg */ if (inst->DstReg.File == PROGRAM_TEMPORARY) { - const GLuint index = inst->DstReg.Index; - assert(index < REG_ALLOCATE_MAX_PROGRAM_TEMPS); + assert(inst->DstReg.Index < REG_ALLOCATE_MAX_PROGRAM_TEMPS); if (inst->DstReg.RelAddr) { if (dbg) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 27867c48d52..67672164f59 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -219,7 +219,6 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); return _mesa_init_gl_program(&prog->Base, target, id); 
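The transform feedback hunk above fixes deletion of the currently bound object: before the final unreference, the binding is retargeted at the default object so ctx->TransformFeedback.CurrentObject can never dangle. The bind-default-before-delete idiom, condensed from the diff:

    if (obj == ctx->TransformFeedback.CurrentObject) {
       /* Rebind the default object first; the deleted name must not
        * remain current. */
       reference_transform_feedback_object(&ctx->TransformFeedback.CurrentObject,
                                           ctx->TransformFeedback.DefaultObject);
    }
    reference_transform_feedback_object(&obj, NULL);  /* may defer the free */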
} - case GL_FRAGMENT_PROGRAM_NV: case GL_FRAGMENT_PROGRAM_ARB: { struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program); return _mesa_init_gl_program(&prog->Base, target, id); diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c index 1c5f5794dd5..c82c9c1b751 100644 --- a/src/mesa/program/program_parse_extra.c +++ b/src/mesa/program/program_parse_extra.c @@ -40,7 +40,6 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state, { inst->Saturate = GL_FALSE; - /* The only possible suffix element is the saturation selector from * ARB_fragment_program. */ @@ -51,7 +50,6 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state, } } - /* It is an error for all of the suffix string not to be consumed. */ return suffix[0] == '\0'; @@ -85,7 +83,6 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) */ option += 4; - if (strncmp(option, "fog_", 4) == 0) { option += 4; @@ -136,10 +133,12 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) * program options will fail to load. */ - if (strcmp(option, "nicest") == 0 && state->option.PrecisionHint != OPTION_FASTEST) { + if (strcmp(option, "nicest") == 0 && + state->option.PrecisionHint != OPTION_FASTEST) { state->option.PrecisionHint = OPTION_NICEST; return 1; - } else if (strcmp(option, "fastest") == 0 && state->option.PrecisionHint != OPTION_NICEST) { + } else if (strcmp(option, "fastest") == 0 && + state->option.PrecisionHint != OPTION_NICEST) { state->option.PrecisionHint = OPTION_FASTEST; return 1; } diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c index 622621bdcbb..fc80adf6f8d 100644 --- a/src/mesa/state_tracker/st_atom.c +++ b/src/mesa/state_tracker/st_atom.c @@ -62,7 +62,12 @@ static const struct st_tracked_state *render_atoms[] = &st_update_tessctrl_texture, &st_update_tesseval_texture, &st_update_sampler, /* depends on update_*_texture for swizzle */ - &st_update_framebuffer, + &st_bind_vs_images, + &st_bind_tcs_images, + &st_bind_tes_images, + &st_bind_gs_images, + &st_bind_fs_images, + &st_update_framebuffer, /* depends on update_*_texture and bind_*_images */ &st_update_msaa, &st_update_sample_shading, &st_update_vs_constants, @@ -85,11 +90,6 @@ static const struct st_tracked_state *render_atoms[] = &st_bind_tes_ssbos, &st_bind_fs_ssbos, &st_bind_gs_ssbos, - &st_bind_vs_images, - &st_bind_tcs_images, - &st_bind_tes_images, - &st_bind_gs_images, - &st_bind_fs_images, &st_update_pixel_transfer, &st_update_tess, diff --git a/src/mesa/state_tracker/st_atom_image.c b/src/mesa/state_tracker/st_atom_image.c index 4b48bc30b69..e96d10a196c 100644 --- a/src/mesa/state_tracker/st_atom_image.c +++ b/src/mesa/state_tracker/st_atom_image.c @@ -25,6 +25,7 @@ **************************************************************************/ #include "main/imports.h" +#include "main/shaderimage.h" #include "program/prog_parameter.h" #include "program/prog_print.h" #include "compiler/glsl/ir_uniform.h" @@ -48,17 +49,19 @@ st_bind_images(struct st_context *st, struct gl_shader *shader, { unsigned i; struct pipe_image_view images[MAX_IMAGE_UNIFORMS]; - struct gl_program_constants *c = &st->ctx->Const.Program[shader->Stage]; + struct gl_program_constants *c; if (!shader || !st->pipe->set_shader_images) return; + c = &st->ctx->Const.Program[shader->Stage]; + for (i = 0; i < shader->NumImages; i++) { struct gl_image_unit *u = &st->ctx->ImageUnits[shader->ImageUnits[i]]; struct st_texture_object *stObj = 
st_texture_object(u->TexObj); struct pipe_image_view *img = &images[i]; - if (!stObj || + if (!_mesa_is_image_unit_valid(st->ctx, u) || !st_finalize_texture(st->ctx, st->pipe, u->TexObj) || !stObj->pt) { memset(img, 0, sizeof(*img)); @@ -67,6 +70,21 @@ st_bind_images(struct st_context *st, struct gl_shader *shader, img->resource = stObj->pt; img->format = st_mesa_format_to_pipe_format(st, u->_ActualFormat); + + switch (u->Access) { + case GL_READ_ONLY: + img->access = PIPE_IMAGE_ACCESS_READ; + break; + case GL_WRITE_ONLY: + img->access = PIPE_IMAGE_ACCESS_WRITE; + break; + case GL_READ_WRITE: + img->access = PIPE_IMAGE_ACCESS_READ_WRITE; + break; + default: + unreachable("bad gl_image_unit::Access"); + } + if (stObj->pt->target == PIPE_BUFFER) { unsigned base, size; unsigned f, n; diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c index a88f0352746..ff90bd61d5b 100644 --- a/src/mesa/state_tracker/st_atom_shader.c +++ b/src/mesa/state_tracker/st_atom_shader.c @@ -37,6 +37,7 @@ #include "main/imports.h" #include "main/mtypes.h" +#include "main/framebuffer.h" #include "program/program.h" #include "pipe/p_context.h" @@ -70,16 +71,13 @@ update_fp( struct st_context *st ) key.clamp_color = st->clamp_frag_color_in_shader && st->ctx->Color._ClampFragmentColor; - /* Don't set it if the driver can force the interpolation by itself. - * If SAMPLE_ID or SAMPLE_POS are used, the interpolation is set - * automatically. - * Ignore sample qualifier while computing this flag. - */ + /* _NEW_MULTISAMPLE | _NEW_BUFFERS */ key.persample_shading = st->force_persample_in_shader && - !(stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) && - _mesa_get_min_invocations_per_fragment(st->ctx, &stfp->Base, true) > 1; + st->ctx->Multisample._Enabled && + st->ctx->Multisample.SampleShading && + st->ctx->Multisample.MinSampleShadingValue * + _mesa_geometric_samples(st->ctx->DrawBuffer) > 1; st->fp_variant = st_get_fp_variant(st, stfp, &key); diff --git a/src/mesa/state_tracker/st_cb_compute.c b/src/mesa/state_tracker/st_cb_compute.c index 364159d62d8..bfc6d96cd57 100644 --- a/src/mesa/state_tracker/st_cb_compute.c +++ b/src/mesa/state_tracker/st_cb_compute.c @@ -47,7 +47,7 @@ static void st_dispatch_compute_common(struct gl_context *ctx, if (ctx->NewState) _mesa_update_state(ctx); - if (st->dirty_cp.st || ctx->NewDriverState) + if (st->dirty_cp.st || st->dirty_cp.mesa || ctx->NewDriverState) st_validate_state(st, ST_PIPELINE_COMPUTE); for (unsigned i = 0; i < 3; i++) { diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index cfec627f10c..bffa4d026cb 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -3104,7 +3104,7 @@ void st_init_texture_functions(struct dd_function_table *functions) { functions->ChooseTextureFormat = st_ChooseTextureFormat; - functions->QuerySamplesForFormat = st_QuerySamplesForFormat; + functions->QueryInternalFormat = st_QueryInternalFormat; functions->TexImage = st_TexImage; functions->TexSubImage = st_TexSubImage; functions->CompressedTexSubImage = st_CompressedTexSubImage; diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index e3ddee660f7..f5a6f8598ca 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -141,9 +141,7 @@ void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state) /* Invalidate render and compute pipelines. 
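The rewritten persample_shading key drops the SAMPLE_ID/SAMPLE_POS exemption and computes the forced-per-sample condition directly from GL state. A worked example of the new expression (numbers illustrative):

    /* key.persample_shading is set when all of the following hold:
     *   st->force_persample_in_shader     (driver needs shader-side help)
     *   ctx->Multisample._Enabled
     *   ctx->Multisample.SampleShading
     *   MinSampleShadingValue * samples > 1
     *
     * e.g. MinSampleShadingValue = 0.5, DrawBuffer samples = 4:
     *        0.5 * 4 = 2.0 > 1  -> per-sample shader variant required
     *      MinSampleShadingValue = 0.25, samples = 2:
     *        0.25 * 2 = 0.5     -> center/centroid interpolation suffices */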
*/ st->dirty.mesa |= new_state; - st->dirty.st |= ST_NEW_MESA; st->dirty_cp.mesa |= new_state; - st->dirty_cp.st |= ST_NEW_MESA; /* This is the only core Mesa module we depend upon. * No longer use swrast, swsetup, tnl. diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index f960c64cbe8..ba51a9c6248 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -50,7 +50,7 @@ struct st_perf_monitor_group; struct u_upload_mgr; -#define ST_NEW_MESA (1 << 0) /* Mesa state has changed */ +/* gap */ #define ST_NEW_FRAGMENT_PROGRAM (1 << 1) #define ST_NEW_VERTEX_PROGRAM (1 << 2) #define ST_NEW_FRAMEBUFFER (1 << 3) diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c index 2de6620602d..fdd59a383a9 100644 --- a/src/mesa/state_tracker/st_draw.c +++ b/src/mesa/state_tracker/st_draw.c @@ -201,7 +201,7 @@ st_draw_vbo(struct gl_context *ctx, st_flush_bitmap_cache(st); /* Validate state. */ - if (st->dirty.st || ctx->NewDriverState) { + if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) { st_validate_state(st, ST_PIPELINE_RENDER); #if 0 @@ -314,7 +314,7 @@ st_indirect_draw_vbo(struct gl_context *ctx, assert(stride); /* Validate state. */ - if (st->dirty.st || ctx->NewDriverState) { + if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) { st_validate_state(st, ST_PIPELINE_RENDER); } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 24c64447f44..3666ece8ee7 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -109,23 +109,20 @@ void st_init_limits(struct pipe_screen *screen, _clamp(screen->get_param(screen, PIPE_CAP_MAX_RENDER_TARGETS), 1, MAX_DRAW_BUFFERS); - c->MaxDualSourceDrawBuffers - = _clamp(screen->get_param(screen, PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS), - 0, MAX_DRAW_BUFFERS); - - c->MaxLineWidth - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_LINE_WIDTH)); - c->MaxLineWidthAA - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_LINE_WIDTH_AA)); - - c->MaxPointSize - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_POINT_WIDTH)); - c->MaxPointSizeAA - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_POINT_WIDTH_AA)); + c->MaxDualSourceDrawBuffers = + _clamp(screen->get_param(screen, + PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS), + 0, MAX_DRAW_BUFFERS); + + c->MaxLineWidth = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_LINE_WIDTH)); + c->MaxLineWidthAA = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_LINE_WIDTH_AA)); + + c->MaxPointSize = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_POINT_WIDTH)); + c->MaxPointSizeAA = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_POINT_WIDTH_AA)); /* these are not queryable. Note that GL basically mandates a 1.0 minimum * for non-aa sizes, but we can go down to 0.0 for aa points. 
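With ST_NEW_MESA gone, Mesa-side dirtiness is carried in its own per-pipeline bitfield rather than being folded into a single st flag, which is why the draw and compute paths above now test st->dirty.mesa (or st->dirty_cp.mesa) explicitly. The resulting idiom, restated:

    /* st_invalidate_state() accumulates raw _NEW_* bits per pipeline: */
    st->dirty.mesa |= new_state;      /* render  */
    st->dirty_cp.mesa |= new_state;   /* compute */

    /* and each entry point validates against its own copy: */
    if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState)
       st_validate_state(st, ST_PIPELINE_RENDER);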
@@ -133,15 +130,16 @@ void st_init_limits(struct pipe_screen *screen, c->MinPointSize = 1.0f; c->MinPointSizeAA = 0.0f; - c->MaxTextureMaxAnisotropy - = _maxf(2.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_TEXTURE_ANISOTROPY)); + c->MaxTextureMaxAnisotropy = + _maxf(2.0f, + screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_ANISOTROPY)); - c->MaxTextureLodBias - = screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_LOD_BIAS); + c->MaxTextureLodBias = + screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_LOD_BIAS); - c->QuadsFollowProvokingVertexConvention = screen->get_param( - screen, PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION); + c->QuadsFollowProvokingVertexConvention = + screen->get_param(screen, + PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION); c->MaxUniformBlockSize = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, @@ -195,21 +193,31 @@ void st_init_limits(struct pipe_screen *screen, PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS), MAX_TEXTURE_IMAGE_UNITS); - pc->MaxInstructions = pc->MaxNativeInstructions = + pc->MaxInstructions = + pc->MaxNativeInstructions = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS); - pc->MaxAluInstructions = pc->MaxNativeAluInstructions = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS); - pc->MaxTexInstructions = pc->MaxNativeTexInstructions = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS); - pc->MaxTexIndirections = pc->MaxNativeTexIndirections = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS); - pc->MaxAttribs = pc->MaxNativeAttribs = + pc->MaxAluInstructions = + pc->MaxNativeAluInstructions = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS); + pc->MaxTexInstructions = + pc->MaxNativeTexInstructions = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS); + pc->MaxTexIndirections = + pc->MaxNativeTexIndirections = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS); + pc->MaxAttribs = + pc->MaxNativeAttribs = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS); - pc->MaxTemps = pc->MaxNativeTemps = + pc->MaxTemps = + pc->MaxNativeTemps = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEMPS); - pc->MaxAddressRegs = pc->MaxNativeAddressRegs = - sh == PIPE_SHADER_VERTEX ? 1 : 0; - pc->MaxParameters = pc->MaxNativeParameters = + pc->MaxAddressRegs = + pc->MaxNativeAddressRegs = sh == PIPE_SHADER_VERTEX ? 1 : 0; + pc->MaxParameters = + pc->MaxNativeParameters = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE) / sizeof(float[4]); pc->MaxInputComponents = @@ -217,10 +225,12 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxOutputComponents = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_OUTPUTS) * 4; - pc->MaxUniformComponents = 4 * MIN2(pc->MaxNativeParameters, MAX_UNIFORMS); + pc->MaxUniformComponents = + 4 * MIN2(pc->MaxNativeParameters, MAX_UNIFORMS); pc->MaxUniformBlocks = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONST_BUFFERS); + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONST_BUFFERS); if (pc->MaxUniformBlocks) pc->MaxUniformBlocks -= 1; /* The first one is for ordinary uniforms. 
*/ pc->MaxUniformBlocks = _min(pc->MaxUniformBlocks, MAX_UNIFORM_BUFFERS); @@ -246,21 +256,33 @@ void st_init_limits(struct pipe_screen *screen, options->EmitNoNoise = TRUE; /* TODO: make these more fine-grained if anyone needs it */ - options->MaxIfDepth = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); - options->EmitNoLoops = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); - options->EmitNoFunctions = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); - options->EmitNoMainReturn = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); - - options->EmitNoCont = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED); - - options->EmitNoIndirectInput = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR); - options->EmitNoIndirectOutput = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR); - options->EmitNoIndirectTemp = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR); - options->EmitNoIndirectUniform = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_CONST_ADDR); + options->MaxIfDepth = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); + options->EmitNoLoops = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); + options->EmitNoFunctions = + !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); + options->EmitNoMainReturn = + !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); + + options->EmitNoCont = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED); + + options->EmitNoIndirectInput = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR); + options->EmitNoIndirectOutput = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR); + options->EmitNoIndirectTemp = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR); + options->EmitNoIndirectUniform = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_CONST_ADDR); if (pc->MaxNativeInstructions && (options->EmitNoIndirectUniform || pc->MaxUniformBlocks < 12)) { @@ -268,10 +290,14 @@ void st_init_limits(struct pipe_screen *screen, } if (options->EmitNoLoops) - options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536); + options->MaxUnrollIterations = + MIN2(screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_INSTRUCTIONS), + 65536); else - options->MaxUnrollIterations = screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT); + options->MaxUnrollIterations = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT); options->LowerClipDistance = true; options->LowerBufferInterfaceBlocks = true; @@ -293,37 +319,50 @@ void st_init_limits(struct pipe_screen *screen, /* This depends on program constants. 
*/ c->MaxTextureCoordUnits - = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, MAX_TEXTURE_COORD_UNITS); + = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, + MAX_TEXTURE_COORD_UNITS); - c->MaxTextureUnits = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, c->MaxTextureCoordUnits); + c->MaxTextureUnits = + _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, + c->MaxTextureCoordUnits); - c->Program[MESA_SHADER_VERTEX].MaxAttribs = MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); + c->Program[MESA_SHADER_VERTEX].MaxAttribs = + MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number * of inputs. It's always 2 colors + N generic inputs. */ c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_MAX_INPUTS); c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); - c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); - c->MaxGeometryTotalOutputComponents = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS); + c->MaxGeometryOutputVertices = + screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); + c->MaxGeometryTotalOutputComponents = + screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS); c->MaxTessPatchComponents = MAX2(screen->get_param(screen, PIPE_CAP_MAX_SHADER_PATCH_VARYINGS), MAX_VARYING) * 4; - c->MinProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET); - c->MaxProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET); + c->MinProgramTexelOffset = + screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET); + c->MaxProgramTexelOffset = + screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET); - c->MaxProgramTextureGatherComponents = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS); - c->MinProgramTextureGatherOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET); - c->MaxProgramTextureGatherOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET); + c->MaxProgramTextureGatherComponents = + screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS); + c->MinProgramTextureGatherOffset = + screen->get_param(screen, PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET); + c->MaxProgramTextureGatherOffset = + screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET); c->MaxTransformFeedbackBuffers = screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS); - c->MaxTransformFeedbackBuffers = MIN2(c->MaxTransformFeedbackBuffers, MAX_FEEDBACK_BUFFERS); + c->MaxTransformFeedbackBuffers = MIN2(c->MaxTransformFeedbackBuffers, + MAX_FEEDBACK_BUFFERS); c->MaxTransformFeedbackSeparateComponents = screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS); c->MaxTransformFeedbackInterleavedComponents = - screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS); + screen->get_param(screen, + PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS); c->MaxVertexStreams = MAX2(1, screen->get_param(screen, PIPE_CAP_MAX_VERTEX_STREAMS)); @@ -368,8 +407,10 @@ void st_init_limits(struct pipe_screen *screen, c->Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers; assert(c->MaxCombinedAtomicBuffers <= MAX_COMBINED_ATOMIC_BUFFERS); - if (c->MaxCombinedAtomicBuffers > 0) + if (c->MaxCombinedAtomicBuffers > 0) { extensions->ARB_shader_atomic_counters = GL_TRUE; + extensions->ARB_shader_atomic_counter_ops = GL_TRUE; + } c->MaxCombinedShaderOutputResources = c->MaxDrawBuffers; 
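Advertising ARB_shader_atomic_counter_ops alongside ARB_shader_atomic_counters is safe here because the glsl_to_tgsi lowering added later in this diff expresses every counter-ops builtin with existing TGSI atomic opcodes. Illustrative shader source this unlocks, quoted in a comment:

    /* #extension GL_ARB_shader_atomic_counter_ops : require
     * layout(binding = 0) uniform atomic_uint c;
     * ...
     * uint prev = atomicCounterAddARB(c, 4u);  // lowered to TGSI ATOMUADD */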
c->ShaderStorageBufferOffsetAlignment = @@ -1028,12 +1069,14 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_ES3_compatibility = GL_TRUE; } +#ifdef HAVE_ST_VDPAU if (screen->get_video_param && screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN, PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_SUPPORTS_INTERLACED)) { extensions->NV_vdpau_interop = GL_TRUE; } +#endif if (screen->get_shader_param(screen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_DOUBLES) && diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index 82bf3a185ad..5392c23ec00 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -42,6 +42,7 @@ #include "main/texstore.h" #include "main/image.h" #include "main/macros.h" +#include "main/formatquery.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -2308,9 +2309,9 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, /** - * Called via ctx->Driver.ChooseTextureFormat(). + * Called via ctx->Driver.QueryInternalFormat(). */ -size_t +static size_t st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, GLenum internalFormat, int samples[16]) { @@ -2349,6 +2350,39 @@ st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, return num_sample_counts; } +/** + * ARB_internalformat_query2 driver hook. + */ +void +st_QueryInternalFormat(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params) +{ + /* The API entry-point gives us a temporary params buffer that is non-NULL + * and guaranteed to have at least 16 elements. + */ + assert(params != NULL); + + switch (pname) { + case GL_SAMPLES: + st_QuerySamplesForFormat(ctx, target, internalFormat, params); + break; + + case GL_NUM_SAMPLE_COUNTS: { + size_t num_samples; + num_samples = st_QuerySamplesForFormat(ctx, target, internalFormat, + params); + params[0] = (GLint) num_samples; + break; + } + + default: + /* For the rest of the pnames, we call back the Mesa's default + * function for drivers that don't implement ARB_internalformat_query2. 
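st_QueryInternalFormat() handles only the sample-count pnames itself and defers everything else to core Mesa's default implementation, so the state tracker gets conservative but correct answers for the full ARB_internalformat_query2 pname set. A hedged sketch of the calling convention the hook assumes (core-side shape, not part of this hunk):

    GLint buffer[16];   /* the hook may write up to 16 values */
    ctx->Driver.QueryInternalFormat(ctx, target, internalformat,
                                    pname, buffer);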
+ */ + _mesa_query_internal_format_default(ctx, target, internalFormat, pname, + params); + } +} /** * This is used for translating texture border color and the clear diff --git a/src/mesa/state_tracker/st_format.h b/src/mesa/state_tracker/st_format.h index 3e10aa64bc6..6ba61df7e4e 100644 --- a/src/mesa/state_tracker/st_format.h +++ b/src/mesa/state_tracker/st_format.h @@ -70,11 +70,9 @@ st_ChooseTextureFormat(struct gl_context * ctx, GLenum target, GLint internalFormat, GLenum format, GLenum type); -size_t -st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]); - - +void +st_QueryInternalFormat(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params); extern void st_translate_color(const union gl_color_union *colorIn, diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 26e463e0437..18414055549 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -3158,8 +3158,8 @@ void glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir) { const char *callee = ir->callee->function_name(); - ir_dereference *deref = static_cast<ir_dereference *>( - ir->actual_parameters.get_head()); + exec_node *param = ir->actual_parameters.get_head(); + ir_dereference *deref = static_cast<ir_dereference *>(param); ir_variable *location = deref->variable_referenced(); st_src_reg buffer( @@ -3188,17 +3188,56 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir) if (!strcmp("__intrinsic_atomic_read", callee)) { inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset); - inst->buffer = buffer; } else if (!strcmp("__intrinsic_atomic_increment", callee)) { inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset, st_src_reg_for_int(1)); - inst->buffer = buffer; } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) { inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset, st_src_reg_for_int(-1)); - inst->buffer = buffer; emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1)); + } else { + param = param->get_next(); + ir_rvalue *val = ((ir_instruction *)param)->as_rvalue(); + val->accept(this); + + st_src_reg data = this->result, data2 = undef_src; + unsigned opcode; + if (!strcmp("__intrinsic_atomic_add", callee)) + opcode = TGSI_OPCODE_ATOMUADD; + else if (!strcmp("__intrinsic_atomic_min", callee)) + opcode = TGSI_OPCODE_ATOMIMIN; + else if (!strcmp("__intrinsic_atomic_max", callee)) + opcode = TGSI_OPCODE_ATOMIMAX; + else if (!strcmp("__intrinsic_atomic_and", callee)) + opcode = TGSI_OPCODE_ATOMAND; + else if (!strcmp("__intrinsic_atomic_or", callee)) + opcode = TGSI_OPCODE_ATOMOR; + else if (!strcmp("__intrinsic_atomic_xor", callee)) + opcode = TGSI_OPCODE_ATOMXOR; + else if (!strcmp("__intrinsic_atomic_exchange", callee)) + opcode = TGSI_OPCODE_ATOMXCHG; + else if (!strcmp("__intrinsic_atomic_comp_swap", callee)) { + opcode = TGSI_OPCODE_ATOMCAS; + param = param->get_next(); + val = ((ir_instruction *)param)->as_rvalue(); + val->accept(this); + data2 = this->result; + } else if (!strcmp("__intrinsic_atomic_sub", callee)) { + opcode = TGSI_OPCODE_ATOMUADD; + st_src_reg res = get_temp(glsl_type::uvec4_type); + st_dst_reg dstres = st_dst_reg(res); + dstres.writemask = dst.writemask; + emit_asm(ir, TGSI_OPCODE_INEG, dstres, data); + data = res; + } else { + assert(!"Unexpected intrinsic"); + return; + } + + inst = emit_asm(ir, opcode, dst, offset, data, data2); } + + inst->buffer = buffer; 
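Of the new counter-ops lowerings above, only subtraction lacks a direct TGSI opcode; it leans on two's-complement arithmetic instead. In outline:

    /* atomicCounterSubtractARB(c, v) has no ATOMUSUB equivalent, so the
     * visitor emits:
     *     INEG     tmp, v         // tmp = (unsigned) -v == 2^32 - v
     *     ATOMUADD dst, off, tmp
     * which is exact subtraction modulo 2^32, matching GL's wrapping
     * semantics for unsigned atomic counters. */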
} void @@ -3577,6 +3616,13 @@ glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir) inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx), _mesa_get_shader_image_format(imgvar->data.image_format)); + + if (imgvar->data.image_coherent) + inst->buffer_access |= TGSI_MEMORY_COHERENT; + if (imgvar->data.image_restrict) + inst->buffer_access |= TGSI_MEMORY_RESTRICT; + if (imgvar->data.image_volatile) + inst->buffer_access |= TGSI_MEMORY_VOLATILE; } void @@ -3591,7 +3637,16 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) /* Filter out intrinsics */ if (!strcmp("__intrinsic_atomic_read", callee) || !strcmp("__intrinsic_atomic_increment", callee) || - !strcmp("__intrinsic_atomic_predecrement", callee)) { + !strcmp("__intrinsic_atomic_predecrement", callee) || + !strcmp("__intrinsic_atomic_add", callee) || + !strcmp("__intrinsic_atomic_sub", callee) || + !strcmp("__intrinsic_atomic_min", callee) || + !strcmp("__intrinsic_atomic_max", callee) || + !strcmp("__intrinsic_atomic_and", callee) || + !strcmp("__intrinsic_atomic_or", callee) || + !strcmp("__intrinsic_atomic_xor", callee) || + !strcmp("__intrinsic_atomic_exchange", callee) || + !strcmp("__intrinsic_atomic_comp_swap", callee)) { visit_atomic_counter_intrinsic(ir); return; } @@ -5524,7 +5579,7 @@ compile_tgsi_instruction(struct st_translate *t, int num_dst; int num_src; - unsigned tex_target; + unsigned tex_target = 0; num_dst = num_inst_dst_regs(inst); num_src = num_inst_src_regs(inst); @@ -5599,32 +5654,38 @@ compile_tgsi_instruction(struct st_translate *t, for (i = num_src - 1; i >= 0; i--) src[i + 1] = src[i]; num_src++; - if (inst->buffer.file == PROGRAM_MEMORY) + if (inst->buffer.file == PROGRAM_MEMORY) { src[0] = t->shared_memory; - else if (inst->buffer.file == PROGRAM_BUFFER) + } else if (inst->buffer.file == PROGRAM_BUFFER) { src[0] = t->buffers[inst->buffer.index]; - else + } else { src[0] = t->images[inst->buffer.index]; + tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow); + } if (inst->buffer.reladdr) src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2])); assert(src[0].File != TGSI_FILE_NULL); ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src, - inst->buffer_access); + inst->buffer_access, + tex_target, inst->image_format); break; case TGSI_OPCODE_STORE: - if (inst->buffer.file == PROGRAM_MEMORY) + if (inst->buffer.file == PROGRAM_MEMORY) { dst[0] = ureg_dst(t->shared_memory); - else if (inst->buffer.file == PROGRAM_BUFFER) + } else if (inst->buffer.file == PROGRAM_BUFFER) { dst[0] = ureg_dst(t->buffers[inst->buffer.index]); - else + } else { dst[0] = ureg_dst(t->images[inst->buffer.index]); + tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow); + } dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask); if (inst->buffer.reladdr) dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2])); assert(dst[0].File != TGSI_FILE_NULL); ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src, - inst->buffer_access); + inst->buffer_access, + tex_target, inst->image_format); break; case TGSI_OPCODE_SCS: @@ -6060,6 +6121,9 @@ st_translate_program( } if (procType == TGSI_PROCESSOR_FRAGMENT) { + if (program->shader->EarlyFragmentTests) + ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1); + if (proginfo->InputsRead & VARYING_BIT_POS) { /* Must do this after setting up t->inputs. 
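The one-line TGSI property addition above is what finally plumbs GLSL's early_fragment_tests qualifier through the state tracker. The chain, roughly:

    /* GLSL:  layout(early_fragment_tests) in;
     * ->     gl_shader::EarlyFragmentTests = true (set by the compiler)
     * ->     ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1)
     * so gallium drivers may run depth/stencil before the shader runs,
     * as image and SSBO stores require for well-defined results. */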
*/ emit_wpos(st_context(ctx), t, proginfo, ureg, diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 2e21d02b8b5..c9f390aa9a2 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -573,10 +573,6 @@ st_translate_fragment_program(struct st_context *st, else interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTER; - if (stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) - interpLocation[slot] = TGSI_INTERPOLATE_LOC_SAMPLE; - switch (attr) { case VARYING_SLOT_POS: input_semantic_name[slot] = TGSI_SEMANTIC_POSITION; diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c index 63af1196af1..71dd15bc4fe 100644 --- a/src/mesa/state_tracker/st_vdpau.c +++ b/src/mesa/state_tracker/st_vdpau.c @@ -49,6 +49,8 @@ #include "st_format.h" #include "st_cb_flush.h" +#ifdef HAVE_ST_VDPAU + static void st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access, GLboolean output, struct gl_texture_object *texObj, @@ -180,9 +182,13 @@ st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access, st_flush(st, NULL, 0); } +#endif + void st_init_vdpau_functions(struct dd_function_table *functions) { +#ifdef HAVE_ST_VDPAU functions->VDPAUMapSurface = st_vdpau_map_surface; functions->VDPAUUnmapSurface = st_vdpau_unmap_surface; +#endif } diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c index af24207e567..0a5fc7e9329 100644 --- a/src/mesa/swrast/s_context.c +++ b/src/mesa/swrast/s_context.c @@ -900,11 +900,16 @@ void _swrast_render_finish( struct gl_context *ctx ) { SWcontext *swrast = SWRAST_CONTEXT(ctx); + struct gl_query_object *query = ctx->Query.CurrentOcclusionObject; _swrast_flush(ctx); if (swrast->Driver.SpanRenderFinish) swrast->Driver.SpanRenderFinish( ctx ); + + if (query && (query->Target == GL_ANY_SAMPLES_PASSED || + query->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE)) + query->Result = !!query->Result; } diff --git a/src/util/mesa-sha1.h b/src/util/mesa-sha1.h index 1599405cd5a..0be5485f313 100644 --- a/src/util/mesa-sha1.h +++ b/src/util/mesa-sha1.h @@ -23,12 +23,12 @@ #ifndef SHA1_H #define SHA1_H +#include <stdlib.h> + #ifdef __cplusplus extern "C" { #endif -#include <stdlib.h> - struct mesa_sha1; struct mesa_sha1 * |
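The swrast hunk above makes boolean occlusion queries spec-conformant: GL_ANY_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED_CONSERVATIVE must report GL_TRUE or GL_FALSE, but swrast accumulates a raw passing-sample count in Result. Normalizing at render-finish handles both cases:

    /* e.g. 1234 samples passed -> !!1234 == 1 (GL_TRUE); 0 stays 0.
     * Only boolean targets are clamped; GL_SAMPLES_PASSED keeps the
     * full sample count. */
    if (query && (query->Target == GL_ANY_SAMPLES_PASSED ||
                  query->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE))
       query->Result = !!query->Result;

The trailing mesa-sha1.h hunk mirrors the samplerobj.h change at the top of this section: includes are hoisted out of the extern "C" block so that C linkage is not forced onto declarations the included headers make visible to C++.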