author | Alok Hota <[email protected]> | 2018-06-05 13:59:53 -0500
---|---|---
committer | Tim Rowley <[email protected]> | 2018-06-18 13:57:38 -0500
commit | a678f40e467bbf72719c60928de26a91f21ac699 (patch) |
tree | 1583ecaf97c287ad3976e4bf77b65a9349b291bc /src/gallium/drivers/swr |
parent | d85fef1e34657fc082b9a763de9499d324fbeebf (diff) |
swr/rast: Clang-Format most rasterizer source code
Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr')
114 files changed, 27802 insertions, 22174 deletions
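The change is mechanical: clang-format rewrites the license banners into ` * `-aligned comment blocks, puts braces on their own lines, aligns runs of consecutive assignments, wraps long expressions and argument lists at the column limit, and appends `// namespace ...` trailers, while the Mako code-generator templates are fenced with `// clang-format off` / `// clang-format on` so the formatter does not mangle their `%` directives. The exact `.clang-format` configuration is not part of this commit; as a rough sketch, the rewritten code below is consistent with settings along the lines of `BreakBeforeBraces: Allman`, `AlignConsecutiveAssignments: true`, `BinPackArguments: false`, `ColumnLimit: 100`, and `FixNamespaceComments: true`. A minimal, compilable illustration of that style (hypothetical names, not taken from the diff):

```cpp
// Illustration only: hypothetical types mimicking the style this commit
// enforces; the real .clang-format file is not shown in this change.
#include <bit>     // std::popcount (C++20)
#include <cstdint>

namespace StyleExample // hypothetical namespace
{
    struct DepthStats
    {
        // AlignConsecutiveAssignments: the '=' of adjacent initializers line up.
        uint32_t earlyZTestPassCount = 0;
        uint32_t earlyZTestFailCount = 0;
        uint32_t lateZTestPassCount  = 0;
        uint32_t lateZTestFailCount  = 0;
    };

    inline void Accumulate(DepthStats& stats, uint32_t passMask, uint32_t coverageMask)
    {
        // Allman braces: '{' always starts its own line, as above.
        stats.earlyZTestPassCount += std::popcount(passMask);

        // Expressions past the column limit break after the operator and indent
        // the continuation (wrapped here to mirror the archrast.cpp handlers).
        stats.earlyZTestFailCount +=
            std::popcount(~passMask & coverageMask);
    }
} // namespace StyleExample (FixNamespaceComments appends this trailer)
```

Running `clang-format -i` with a configuration of this shape over the listed files would reproduce churn of exactly this kind, which is why the diff touches 114 files without any functional change.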
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp index 502835ca801..ceb06ae471f 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp +++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file archrast.cpp -* -* @brief Implementation for archrast. -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file archrast.cpp + * + * @brief Implementation for archrast. 
+ * + ******************************************************************************/ #include <atomic> #include "common/os.h" @@ -38,14 +38,14 @@ namespace ArchRast /// @brief struct that keeps track of depth and stencil event information struct DepthStencilStats { - uint32_t earlyZTestPassCount = 0; - uint32_t earlyZTestFailCount = 0; - uint32_t lateZTestPassCount = 0; - uint32_t lateZTestFailCount = 0; + uint32_t earlyZTestPassCount = 0; + uint32_t earlyZTestFailCount = 0; + uint32_t lateZTestPassCount = 0; + uint32_t lateZTestFailCount = 0; uint32_t earlyStencilTestPassCount = 0; uint32_t earlyStencilTestFailCount = 0; - uint32_t lateStencilTestPassCount = 0; - uint32_t lateStencilTestFailCount = 0; + uint32_t lateStencilTestPassCount = 0; + uint32_t lateStencilTestFailCount = 0; }; struct CStats @@ -76,12 +76,12 @@ namespace ArchRast struct CullStats { uint32_t degeneratePrimCount = 0; - uint32_t backfacePrimCount = 0; + uint32_t backfacePrimCount = 0; }; struct AlphaStats { - uint32_t alphaTestCount = 0; + uint32_t alphaTestCount = 0; uint32_t alphaBlendCount = 0; }; @@ -93,20 +93,26 @@ namespace ArchRast class EventHandlerApiStats : public EventHandlerFile { public: - EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) { + EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) + { #if defined(_WIN32) - // Attempt to copy the events.proto file to the ArchRast output dir. It's common for tools to place the events.proto file - // in the DEBUG_OUTPUT_DIR when launching AR. If it exists, this will attempt to copy it the first time we get here to package - // it with the stats. Otherwise, the user would need to specify the events.proto location when parsing the stats in post. + // Attempt to copy the events.proto file to the ArchRast output dir. It's common for + // tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it + // exists, this will attempt to copy it the first time we get here to package it with + // the stats. Otherwise, the user would need to specify the events.proto location when + // parsing the stats in post. 
std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename; eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends; - eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1) << "\\events.proto" << std::ends; + eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1) + << "\\events.proto" << std::ends; // If event.proto already exists, we're done; else do the copy struct stat buf; // Use a Posix stat for file existence check - if (!stat(eventsProtoDstFilename.str().c_str(), &buf) == 0) { + if (!stat(eventsProtoDstFilename.str().c_str(), &buf) == 0) + { // Now check to make sure the events.proto source exists - if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0) { + if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0) + { std::ifstream srcFile; srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary); if (srcFile.is_open()) @@ -125,18 +131,40 @@ namespace ArchRast virtual void Handle(const DrawInstancedEvent& event) { - DrawInfoEvent e(event.data.drawId, ArchRast::Instanced, event.data.topology, - event.data.numVertices, 0, 0, event.data.startVertex, event.data.numInstances, - event.data.startInstance, event.data.tsEnable, event.data.gsEnable, event.data.soEnable, event.data.soTopology, event.data.splitId); - + DrawInfoEvent e(event.data.drawId, + ArchRast::Instanced, + event.data.topology, + event.data.numVertices, + 0, + 0, + event.data.startVertex, + event.data.numInstances, + event.data.startInstance, + event.data.tsEnable, + event.data.gsEnable, + event.data.soEnable, + event.data.soTopology, + event.data.splitId); + EventHandlerFile::Handle(e); } virtual void Handle(const DrawIndexedInstancedEvent& event) { - DrawInfoEvent e(event.data.drawId, ArchRast::IndexedInstanced, event.data.topology, 0, - event.data.numIndices, event.data.indexOffset, event.data.baseVertex, event.data.numInstances, - event.data.startInstance, event.data.tsEnable, event.data.gsEnable, event.data.soEnable, event.data.soTopology, event.data.splitId); + DrawInfoEvent e(event.data.drawId, + ArchRast::IndexedInstanced, + event.data.topology, + 0, + event.data.numIndices, + event.data.indexOffset, + event.data.baseVertex, + event.data.numInstances, + event.data.startInstance, + event.data.tsEnable, + event.data.gsEnable, + event.data.soEnable, + event.data.soTopology, + event.data.splitId); EventHandlerFile::Handle(e); } @@ -156,127 +184,148 @@ namespace ArchRast virtual void Handle(const EarlyDepthStencilInfoSingleSample& event) { - //earlyZ test compute + // earlyZ test compute mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSingleSample.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSSingleSample.earlyZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //earlyStencil test compute + // earlyStencil test compute mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSingleSample.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSSingleSample.earlyStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - //earlyZ test single and multi sample + // earlyZ test single and multi sample mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + 
mDSCombined.earlyZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //earlyStencil test single and multi sample + // earlyStencil test single and multi sample mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSCombined.earlyStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const EarlyDepthStencilInfoSampleRate& event) { - //earlyZ test compute + // earlyZ test compute mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSampleRate.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSSampleRate.earlyZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //earlyStencil test compute + // earlyStencil test compute mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSampleRate.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSSampleRate.earlyStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - //earlyZ test single and multi sample + // earlyZ test single and multi sample mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSCombined.earlyZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //earlyStencil test single and multi sample + // earlyStencil test single and multi sample mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSCombined.earlyStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const EarlyDepthStencilInfoNullPS& event) { - //earlyZ test compute + // earlyZ test compute mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSNullPS.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSNullPS.earlyZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //earlyStencil test compute + // earlyStencil test compute mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSNullPS.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSNullPS.earlyStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const LateDepthStencilInfoSingleSample& event) { - //lateZ test compute + // lateZ test compute mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSingleSample.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSSingleSample.lateZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //lateStencil test compute + // lateStencil test compute mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - 
mDSSingleSample.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSSingleSample.lateStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - //lateZ test single and multi sample + // lateZ test single and multi sample mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSCombined.lateZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //lateStencil test single and multi sample + // lateStencil test single and multi sample mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSCombined.lateStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const LateDepthStencilInfoSampleRate& event) { - //lateZ test compute + // lateZ test compute mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSampleRate.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSSampleRate.lateZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //lateStencil test compute + // lateStencil test compute mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSampleRate.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSSampleRate.lateStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - //lateZ test single and multi sample + // lateZ test single and multi sample mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSCombined.lateZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //lateStencil test single and multi sample + // lateStencil test single and multi sample mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSCombined.lateStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const LateDepthStencilInfoNullPS& event) { - //lateZ test compute + // lateZ test compute mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSNullPS.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); + mDSNullPS.lateZTestFailCount += + _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - //lateStencil test compute + // lateStencil test compute mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSNullPS.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); + mDSNullPS.lateStencilTestFailCount += + _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); mNeedFlush = true; } virtual void Handle(const EarlyDepthInfoPixelRate& event) { - //earlyZ test compute + // earlyZ test compute 
mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount; - mDSPixelRate.earlyZTestFailCount += (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); + mDSPixelRate.earlyZTestFailCount += + (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); mNeedFlush = true; } virtual void Handle(const LateDepthInfoPixelRate& event) { - //lateZ test compute + // lateZ test compute mDSPixelRate.lateZTestPassCount += event.data.depthPassCount; - mDSPixelRate.lateZTestFailCount += (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); + mDSPixelRate.lateZTestFailCount += + (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); mNeedFlush = true; } @@ -284,8 +333,10 @@ namespace ArchRast virtual void Handle(const ClipInfoEvent& event) { mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask); - mClipper.trivialRejectCount += event.data.numInvocations - _mm_popcnt_u32(event.data.validMask); - mClipper.trivialAcceptCount += _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask); + mClipper.trivialRejectCount += + event.data.numInvocations - _mm_popcnt_u32(event.data.validMask); + mClipper.trivialAcceptCount += + _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask); } struct ShaderStats @@ -328,58 +379,86 @@ namespace ArchRast // Flush cached events for this draw virtual void FlushDraw(uint32_t drawId) { - if (mNeedFlush == false) return; + if (mNeedFlush == false) + return; EventHandlerFile::Handle(PSInfo(drawId, mShaderStats[SHADER_PIXEL].numInstExecuted)); EventHandlerFile::Handle(CSInfo(drawId, mShaderStats[SHADER_COMPUTE].numInstExecuted)); - //singleSample - EventHandlerFile::Handle(EarlyZSingleSample(drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZSingleSample(drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount)); - EventHandlerFile::Handle(EarlyStencilSingleSample(drawId, mDSSingleSample.earlyStencilTestPassCount, mDSSingleSample.earlyStencilTestFailCount)); - EventHandlerFile::Handle(LateStencilSingleSample(drawId, mDSSingleSample.lateStencilTestPassCount, mDSSingleSample.lateStencilTestFailCount)); - - //sampleRate - EventHandlerFile::Handle(EarlyZSampleRate(drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZSampleRate(drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount)); - EventHandlerFile::Handle(EarlyStencilSampleRate(drawId, mDSSampleRate.earlyStencilTestPassCount, mDSSampleRate.earlyStencilTestFailCount)); - EventHandlerFile::Handle(LateStencilSampleRate(drawId, mDSSampleRate.lateStencilTestPassCount, mDSSampleRate.lateStencilTestFailCount)); - - //combined - EventHandlerFile::Handle(EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount)); - EventHandlerFile::Handle(EarlyStencil(drawId, mDSCombined.earlyStencilTestPassCount, mDSCombined.earlyStencilTestFailCount)); - EventHandlerFile::Handle(LateStencil(drawId, mDSCombined.lateStencilTestPassCount, mDSCombined.lateStencilTestFailCount)); - - //pixelRate - EventHandlerFile::Handle(EarlyZPixelRate(drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZPixelRate(drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount)); - - - //NullPS - EventHandlerFile::Handle(EarlyZNullPS(drawId, 
mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount)); - EventHandlerFile::Handle(EarlyStencilNullPS(drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount)); + // singleSample + EventHandlerFile::Handle(EarlyZSingleSample( + drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount)); + EventHandlerFile::Handle(LateZSingleSample( + drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount)); + EventHandlerFile::Handle( + EarlyStencilSingleSample(drawId, + mDSSingleSample.earlyStencilTestPassCount, + mDSSingleSample.earlyStencilTestFailCount)); + EventHandlerFile::Handle( + LateStencilSingleSample(drawId, + mDSSingleSample.lateStencilTestPassCount, + mDSSingleSample.lateStencilTestFailCount)); + + // sampleRate + EventHandlerFile::Handle(EarlyZSampleRate( + drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount)); + EventHandlerFile::Handle(LateZSampleRate( + drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount)); + EventHandlerFile::Handle( + EarlyStencilSampleRate(drawId, + mDSSampleRate.earlyStencilTestPassCount, + mDSSampleRate.earlyStencilTestFailCount)); + EventHandlerFile::Handle(LateStencilSampleRate(drawId, + mDSSampleRate.lateStencilTestPassCount, + mDSSampleRate.lateStencilTestFailCount)); + + // combined + EventHandlerFile::Handle( + EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount)); + EventHandlerFile::Handle( + LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount)); + EventHandlerFile::Handle(EarlyStencil(drawId, + mDSCombined.earlyStencilTestPassCount, + mDSCombined.earlyStencilTestFailCount)); + EventHandlerFile::Handle(LateStencil(drawId, + mDSCombined.lateStencilTestPassCount, + mDSCombined.lateStencilTestFailCount)); + + // pixelRate + EventHandlerFile::Handle(EarlyZPixelRate( + drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount)); + EventHandlerFile::Handle(LateZPixelRate( + drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount)); + + + // NullPS + EventHandlerFile::Handle( + EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount)); + EventHandlerFile::Handle(EarlyStencilNullPS( + drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount)); // Rasterized Subspans EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles)); // Alpha Subspans - EventHandlerFile::Handle(AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount)); + EventHandlerFile::Handle( + AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount)); // Primitive Culling - EventHandlerFile::Handle(CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount)); + EventHandlerFile::Handle( + CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount)); mDSSingleSample = {}; - mDSSampleRate = {}; - mDSCombined = {}; - mDSPixelRate = {}; + mDSSampleRate = {}; + mDSCombined = {}; + mDSPixelRate = {}; mDSNullPS = {}; - rastStats = {}; - mCullStats = {}; + rastStats = {}; + mCullStats = {}; mAlphaStats = {}; - mShaderStats[SHADER_PIXEL] = {}; + mShaderStats[SHADER_PIXEL] = {}; mShaderStats[SHADER_COMPUTE] = {}; mNeedFlush = false; @@ -387,31 +466,38 @@ namespace ArchRast virtual void Handle(const FrontendDrawEndEvent& event) { - //Clipper - EventHandlerFile::Handle(ClipperEvent(event.data.drawId, mClipper.trivialRejectCount, mClipper.trivialAcceptCount, 
mClipper.mustClipCount)); + // Clipper + EventHandlerFile::Handle(ClipperEvent(event.data.drawId, + mClipper.trivialRejectCount, + mClipper.trivialAcceptCount, + mClipper.mustClipCount)); - //Tesselator + // Tesselator EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims)); - //Geometry Shader + // Geometry Shader EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount)); EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount)); EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput)); - EventHandlerFile::Handle(VSInfo(event.data.drawId, mShaderStats[SHADER_VERTEX].numInstExecuted)); - EventHandlerFile::Handle(HSInfo(event.data.drawId, mShaderStats[SHADER_HULL].numInstExecuted)); - EventHandlerFile::Handle(DSInfo(event.data.drawId, mShaderStats[SHADER_DOMAIN].numInstExecuted)); - EventHandlerFile::Handle(GSInfo(event.data.drawId, mShaderStats[SHADER_GEOMETRY].numInstExecuted)); - - mShaderStats[SHADER_VERTEX] = {}; - mShaderStats[SHADER_HULL] = {}; - mShaderStats[SHADER_DOMAIN] = {}; + EventHandlerFile::Handle( + VSInfo(event.data.drawId, mShaderStats[SHADER_VERTEX].numInstExecuted)); + EventHandlerFile::Handle( + HSInfo(event.data.drawId, mShaderStats[SHADER_HULL].numInstExecuted)); + EventHandlerFile::Handle( + DSInfo(event.data.drawId, mShaderStats[SHADER_DOMAIN].numInstExecuted)); + EventHandlerFile::Handle( + GSInfo(event.data.drawId, mShaderStats[SHADER_GEOMETRY].numInstExecuted)); + + mShaderStats[SHADER_VERTEX] = {}; + mShaderStats[SHADER_HULL] = {}; + mShaderStats[SHADER_DOMAIN] = {}; mShaderStats[SHADER_GEOMETRY] = {}; - //Reset Internal Counters + // Reset Internal Counters mClipper = {}; - mTS = {}; - mGS = {}; + mTS = {}; + mGS = {}; } virtual void Handle(const GSPrimInfo& event) @@ -421,10 +507,7 @@ namespace ArchRast mGS.vertsInput += event.data.vertsInput; } - virtual void Handle(const TessPrimCount& event) - { - mTS.inputPrims += event.data.primCount; - } + virtual void Handle(const TessPrimCount& event) { mTS.inputPrims += event.data.primCount; } virtual void Handle(const RasterTileCount& event) { @@ -433,13 +516,15 @@ namespace ArchRast virtual void Handle(const CullInfoEvent& event) { - mCullStats.degeneratePrimCount += _mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask)); - mCullStats.backfacePrimCount += _mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask)); + mCullStats.degeneratePrimCount += _mm_popcnt_u32( + event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask)); + mCullStats.backfacePrimCount += _mm_popcnt_u32( + event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask)); } virtual void Handle(const AlphaInfoEvent& event) { - mAlphaStats.alphaTestCount += event.data.alphaTestEnable; + mAlphaStats.alphaTestCount += event.data.alphaTestEnable; mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable; } @@ -447,17 +532,17 @@ namespace ArchRast bool mNeedFlush; // Per draw stats DepthStencilStats mDSSingleSample = {}; - DepthStencilStats mDSSampleRate = {}; - DepthStencilStats mDSPixelRate = {}; - DepthStencilStats mDSCombined = {}; - DepthStencilStats mDSNullPS = {}; - DepthStencilStats mDSOmZ = {}; - CStats mClipper = {}; - TEStats mTS = {}; - GSStateInfo mGS = {}; - RastStats rastStats = {}; - CullStats mCullStats = {}; - AlphaStats mAlphaStats = {}; + DepthStencilStats mDSSampleRate = {}; + DepthStencilStats mDSPixelRate = {}; + DepthStencilStats mDSCombined = {}; 
+ DepthStencilStats mDSNullPS = {}; + DepthStencilStats mDSOmZ = {}; + CStats mClipper = {}; + TEStats mTS = {}; + GSStateInfo mGS = {}; + RastStats rastStats = {}; + CullStats mCullStats = {}; + AlphaStats mAlphaStats = {}; ShaderStats mShaderStats[NUM_SHADER_TYPES]; @@ -473,7 +558,7 @@ namespace ArchRast { // Can we assume single threaded here? static std::atomic<uint32_t> counter(0); - uint32_t id = counter.fetch_add(1); + uint32_t id = counter.fetch_add(1); EventManager* pManager = new EventManager(); @@ -528,4 +613,4 @@ namespace ArchRast pManager->FlushDraw(drawId); } -} +} // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h index c74d6ad9097..d42c197bcda 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h +++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file archrast.h -* -* @brief Definitions for archrast. -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file archrast.h + * + * @brief Definitions for archrast. + * + ******************************************************************************/ #pragma once #include "common/os.h" @@ -35,15 +35,14 @@ namespace ArchRast { enum class AR_THREAD { - API = 0, + API = 0, WORKER = 1 }; HANDLE CreateThreadContext(AR_THREAD type); - void DestroyThreadContext(HANDLE hThreadContext); + void DestroyThreadContext(HANDLE hThreadContext); // Dispatch event for this thread. void Dispatch(HANDLE hThreadContext, const Event& event); void FlushDraw(HANDLE hThreadContext, uint32_t drawId); -}; - +}; // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h index 10e0dce6ad9..118a100e850 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h +++ b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file archrast.h -* -* @brief Definitions for the event manager. -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file archrast.h + * + * @brief Definitions for the event manager. + * + ******************************************************************************/ #pragma once #include "common/os.h" @@ -78,12 +78,11 @@ namespace ArchRast pHandler->FlushDraw(drawId); } } - private: + private: // Handlers stay registered for life void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); } std::vector<EventHandler*> mHandlers; }; -}; - +}; // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp index 1ecb455c3a1..e696dd2096a 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp @@ -1,35 +1,36 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename} -* -* @brief Implementation for events. auto-generated file -* -* DO NOT EDIT -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file ${filename} + * + * @brief Implementation for events. auto-generated file + * + * DO NOT EDIT + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off #include "common/os.h" #include "gen_ar_event.hpp" #include "gen_ar_eventhandler.hpp" @@ -42,3 +43,5 @@ void ${name}::Accept(EventHandler* pHandler) const pHandler->Handle(*this); } % endfor +// clang-format on + diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp index 685a10b3867..fe3f261f680 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp @@ -1,35 +1,36 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename} -* -* @brief Definitions for events. auto-generated file -* -* DO NOT EDIT -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file ${filename} + * + * @brief Definitions for events. auto-generated file + * + * DO NOT EDIT + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off #pragma once #include "common/os.h" @@ -46,7 +47,7 @@ namespace ArchRast }; % endfor - //Forward decl + // Forward decl class EventHandler; ////////////////////////////////////////////////////////////////////////// @@ -104,5 +105,6 @@ namespace ArchRast virtual void Accept(EventHandler* pHandler) const; }; -% endfor -}
\ No newline at end of file + % endfor +} // namespace ArchRast +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp index 87d0ef47cab..140dd00dbeb 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp @@ -1,35 +1,36 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename} -* -* @brief Event handler interface. auto-generated file -* -* DO NOT EDIT -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file ${filename} + * + * @brief Event handler interface. 
auto-generated file + * + * DO NOT EDIT + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off #pragma once #include "${event_header}" @@ -51,4 +52,5 @@ namespace ArchRast virtual void Handle(const ${name}& event) {} % endfor }; -} +} // namespace ArchRast +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp index 79612f31208..7c10e124c3c 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp @@ -1,35 +1,36 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename} -* -* @brief Event handler interface. auto-generated file -* -* DO NOT EDIT -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file ${filename} + * + * @brief Event handler interface. 
auto-generated file + * + * DO NOT EDIT + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off #pragma once #include "common/os.h" @@ -47,19 +48,22 @@ namespace ArchRast class EventHandlerFile : public EventHandler { public: - EventHandlerFile(uint32_t id) - : mBufOffset(0) + EventHandlerFile(uint32_t id) : mBufOffset(0) { #if defined(_WIN32) DWORD pid = GetCurrentProcessId(); TCHAR procname[MAX_PATH]; GetModuleFileName(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); + const char* pBaseName = strrchr(procname, '\\'); std::stringstream outDir; outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; mOutputDir = outDir.str(); - if (CreateDirectory(mOutputDir.c_str(), NULL)) { - std::cout << std::endl << "ArchRast Dir: " << mOutputDir << std::endl << std::endl << std::flush; + if (CreateDirectory(mOutputDir.c_str(), NULL)) + { + std::cout << std::endl + << "ArchRast Dir: " << mOutputDir << std::endl + << std::endl + << std::flush; } // There could be multiple threads creating thread pools. We @@ -80,10 +84,7 @@ namespace ArchRast #endif } - virtual ~EventHandlerFile() - { - FlushBuffer(); - } + virtual ~EventHandlerFile() { FlushBuffer(); } ////////////////////////////////////////////////////////////////////////// /// @brief Flush buffer to file. @@ -109,7 +110,7 @@ namespace ArchRast file.write((char*)mBuffer, mBufOffset); file.close(); - mBufOffset = 0; + mBufOffset = 0; mHeaderBufOffset = 0; // Reset header offset so its no longer considered. } return true; @@ -124,7 +125,8 @@ namespace ArchRast if (!FlushBuffer()) { // Don't corrupt what's already in the buffer? - /// @todo Maybe add corrupt marker to buffer here in case we can open file in future? + /// @todo Maybe add corrupt marker to buffer here in case we can open file in + /// future? return; } } @@ -159,8 +161,9 @@ namespace ArchRast std::string mOutputDir; static const uint32_t mBufferSize = 1024; - uint8_t mBuffer[mBufferSize]; + uint8_t mBuffer[mBufferSize]; uint32_t mBufOffset{0}; uint32_t mHeaderBufOffset{0}; }; -} +} // namespace ArchRast +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp index 088b1cd79d5..b8da5298f3d 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp @@ -19,11 +19,11 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. 
-// +// // @file BackendPixelRate${fileNum}.cpp -// +// // @brief auto-generated file -// +// // DO NOT EDIT // // Generation Command Line: diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp index bcbcb30cc14..5182bc4259f 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp @@ -30,6 +30,7 @@ // ${'\n// '.join(cmdline)} // //============================================================================ +// clang-format off #pragma once //============================================================================ @@ -57,10 +58,10 @@ ${func['decl']} %for arg in func['types']: args.push_back(${arg}->getType()); %endfor - Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args); + Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args); return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); %else: - Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); + Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); %endif %else: @@ -68,4 +69,5 @@ ${func['decl']} %endif } -%endfor +% endfor + // clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp index 5625ef8a0de..d0682c55f03 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp @@ -19,11 +19,11 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. -// +// // @file ${filename} -// +// // @brief auto-generated file -// +// // DO NOT EDIT // // Generation Command Line: @@ -31,6 +31,8 @@ // //============================================================================ +// clang-format off + %for num in range(numFiles): void Init${tableName}${num}(); %endfor @@ -41,3 +43,4 @@ static INLINE void Init${tableName}() Init${tableName}${num}(); %endfor } +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp index cfdc37072e5..9375569ebeb 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp @@ -1,35 +1,36 @@ /****************************************************************************** -* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename}.cpp -* -* @brief Dynamic Knobs for Core. -* -* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file ${filename}.cpp + * + * @brief Dynamic Knobs for Core. + * + * ======================= AUTO GENERATED: DO NOT EDIT !!! 
==================== + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off <% calc_max_knob_len(knobs) %> % for inc in includes: #include <${inc}> @@ -40,13 +41,14 @@ //======================================================== // Implementation //======================================================== -void KnobBase::autoExpandEnvironmentVariables(std::string &text) +void KnobBase::autoExpandEnvironmentVariables(std::string& text) { #if (__GNUC__) && (GCC_VERSION < 409000) // <regex> isn't implemented prior to gcc-4.9.0 // unix style variable replacement size_t start; - while ((start = text.find("${'${'}")) != std::string::npos) { + while ((start = text.find("${'${'}")) != std::string::npos) + { size_t end = text.find("}"); if (end == std::string::npos) break; @@ -54,7 +56,8 @@ void KnobBase::autoExpandEnvironmentVariables(std::string &text) text.replace(start, end - start + 1, var); } // win32 style variable replacement - while ((start = text.find("%")) != std::string::npos) { + while ((start = text.find("%")) != std::string::npos) + { size_t end = text.find("%", start + 1); if (end == std::string::npos) break; @@ -65,7 +68,7 @@ void KnobBase::autoExpandEnvironmentVariables(std::string &text) { // unix style variable replacement static std::regex env("\\$\\{([^}]+)\\}"); - std::smatch match; + std::smatch match; while (std::regex_search(text, match, env)) { const std::string var = GetEnv(match[1].str()); @@ -77,7 +80,7 @@ void KnobBase::autoExpandEnvironmentVariables(std::string &text) { // win32 style variable replacement static std::regex env("\\%([^}]+)\\%"); - std::smatch match; + std::smatch match; while (std::regex_search(text, match, env)) { const std::string var = GetEnv(match[1].str()); @@ -89,7 +92,6 @@ void KnobBase::autoExpandEnvironmentVariables(std::string &text) #endif } - //======================================================== // Static Data Members //======================================================== @@ -113,7 +115,10 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix) std::basic_stringstream<char> str; str << std::showbase << std::setprecision(1) << std::fixed; - if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; } + if (optPerLinePrefix == nullptr) + { + optPerLinePrefix = ""; + } % for knob in knobs: str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}"; @@ -157,3 +162,4 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix) name_len = len(name) return ' '*(max_len - name_len) %> +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h index 4213f334433..71dbdacfd1d 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h @@ -1,35 +1,36 @@ /****************************************************************************** -* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename}.h -* -* @brief Dynamic Knobs for Core. -* -* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file ${filename}.h + * + * @brief Dynamic Knobs for Core. + * + * ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off <% calc_max_knob_len(knobs) %> #pragma once #include <string> @@ -38,11 +39,11 @@ struct KnobBase { private: // Update the input string. - static void autoExpandEnvironmentVariables(std::string &text); + static void autoExpandEnvironmentVariables(std::string& text); protected: // Leave input alone and return new string. 
- static std::string expandEnvironmentVariables(std::string const &input) + static std::string expandEnvironmentVariables(std::string const& input) { std::string text = input; autoExpandEnvironmentVariables(text); @@ -50,7 +51,7 @@ protected: } template <typename T> - static T expandEnvironmentVariables(T const &input) + static T expandEnvironmentVariables(T const& input) { return input; } @@ -60,8 +61,8 @@ template <typename T> struct Knob : KnobBase { public: - const T& Value() const { return m_Value; } - const T& Value(T const &newValue) + const T& Value() const { return m_Value; } + const T& Value(T const& newValue) { m_Value = expandEnvironmentVariables(newValue); return Value(); @@ -150,3 +151,4 @@ extern GlobalKnobs g_GlobalKnobs; name_len = len(name) return ' '*(max_len - name_len) %> +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp index 190e660ad1c..df2934fa615 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp @@ -1,35 +1,37 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ${filename} -* -* @brief auto-generated file -* -* DO NOT EDIT -* -* Generation Command Line: -* ${'\n* '.join(cmdline)} -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
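// ----------------------------------------------------------------------------
// Editor's note (not part of the commit): the gen_knobs.h template reflowed
// above leans on overload resolution so that only string-valued knobs pay for
// environment expansion; every other knob type passes through untouched. A
// sketch of that pattern, with hypothetical names, might be:

#include <string>

static std::string expand(std::string const& input) // string overload: expands
{
    std::string text = input;
    // ... autoExpandEnvironmentVariables(text) would run here ...
    return text;
}

template <typename T>
static T expand(T const& input) // generic overload: pass-through
{
    return input;
}

template <typename T>
struct Knob
{
    const T& Value() const { return m_Value; }
    const T& Value(T const& newValue)
    {
        m_Value = expand(newValue); // overload picked at compile time
        return Value();
    }

private:
    T m_Value{};
};

// e.g. Knob<std::string>{}.Value("${HOME}/out") goes through the expanding
// overload, while Knob<int>{}.Value(4) copies the int straight through.
// ----------------------------------------------------------------------------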
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file ${filename} + * + * @brief auto-generated file + * + * DO NOT EDIT + * + * Generation Command Line: + * ${'\n * '.join(cmdline)} + * + ******************************************************************************/ +// clang-format off + #pragma once namespace SwrJit @@ -37,7 +39,7 @@ namespace SwrJit using namespace llvm; %for type in types: - INLINE static StructType *Gen_${type['name']}(JitManager* pJitMgr) + INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr) { %if needs_ctx(type): LLVMContext& ctx = pJitMgr->mContext; @@ -76,7 +78,7 @@ namespace SwrJit %endfor %endfor -} // ns SwrJit +} // namespace SwrJit <%! # Global function definitions import os @@ -98,3 +100,4 @@ namespace SwrJit pad_amt = max_len - cur_len return ' '*pad_amt %> +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp index 06c876231b9..92e0f406235 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp @@ -19,17 +19,18 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. -// +// // @file gen_rasterizer${fileNum}.cpp -// +// // @brief auto-generated file -// +// // DO NOT EDIT // // Generation Command Line: // ${'\n// '.join(cmdline)} // //============================================================================ +// clang-format off #include "core/rasterizer.h" #include "core/rasterizer_impl.h" @@ -40,3 +41,4 @@ void InitRasterizerFuncs${fileNum}() ${func} %endfor } +// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp index 1c086ff1882..e0800f5e88e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file formats.cpp -* -* @brief auto-generated file -* -* DO NOT EDIT -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file formats.cpp + * + * @brief auto-generated file + * + * DO NOT EDIT + * + ******************************************************************************/ #include "formats.h" @@ -72,6842 +72,9227 @@ const SWR_FORMAT_INFO gFormatInfo[] = { // R32G32B32A32_FLOAT (0x0) { "R32G32B32A32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32A32_SINT (0x1) { "R32G32B32A32_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32A32_UINT (0x2) { "R32G32B32A32_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R64G64_FLOAT (0x5) { "R64G64_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 64, 64, 0, 0 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {64, 64, 0, 0}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32X32_FLOAT (0x6) { "R32G32B32X32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32A32_SSCALED (0x7) { "R32G32B32A32_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32A32_USCALED (0x8) { "R32G32B32A32_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x10) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x11) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x14) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 
0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x18) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x19) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 
0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R32G32B32A32_SFIXED (0x20) { "R32G32B32A32_SFIXED", - { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 32, 32 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 32, 32}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x21) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x22) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x23) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x24) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x25) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x26) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x27) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, 
+ {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x28) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x29) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x2F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x30) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x31) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x32) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x33) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x34) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + 
false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x35) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x36) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x37) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x38) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x39) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + 
{nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x3F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R32G32B32_FLOAT (0x40) { "R32G32B32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32_SINT (0x41) { "R32G32B32_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32_UINT (0x42) { "R32G32B32_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x43) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x44) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R32G32B32_SSCALED (0x45) { "R32G32B32_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32B32_USCALED (0x46) { "R32G32B32_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x47) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x48) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x49) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + 
{nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x4F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R32G32B32_SFIXED (0x50) { "R32G32B32_SFIXED", - { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 32, 32, 32, 0 }, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {32, 32, 32, 0}, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x51) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x52) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x53) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x54) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x55) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x56) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x57) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + 
{nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x58) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x59) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x5F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x60) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x61) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x62) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x63) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x64) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + 
false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x65) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x66) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x67) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x68) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x69) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + 
{nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x6F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x70) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x71) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x72) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x73) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x74) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x75) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x76) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x77) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x78) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + 
false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x79) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x7F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + 
{nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R16G16B16A16_UNORM (0x80) { "R16G16B16A16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 65535.0f, + 1.0f / 65535.0f, + 1.0f / 65535.0f, + 1.0f / 65535.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_SNORM (0x81) { "R16G16B16A16_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 32767.0f, + 1.0f / 32767.0f, + 1.0f / 32767.0f, + 1.0f / 32767.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_SINT (0x82) { "R16G16B16A16_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_UINT (0x83) { "R16G16B16A16_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_FLOAT (0x84) { "R16G16B16A16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32_FLOAT (0x85) { "R32G32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32_SINT (0x86) { "R32G32_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32_UINT (0x87) { "R32G32_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_FLOAT_X8X24_TYPELESS (0x88) { "R32_FLOAT_X8X24_TYPELESS", - { SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // X32_TYPELESS_G8X24_UINT (0x89) { "X32_TYPELESS_G8X24_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L32A32_FLOAT (0x8A) { "L32A32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x8B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x8C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R64_FLOAT (0x8D) { "R64_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 64, 0, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {64, 0, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16X16_UNORM (0x8E) { "R16G16B16X16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? 
+ {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16X16_FLOAT (0x8F) { "R16G16B16X16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x90) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L32X32_FLOAT (0x91) { "L32X32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I32X32_FLOAT (0x92) { "I32X32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_SSCALED (0x93) { "R16G16B16A16_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16A16_USCALED (0x94) { "R16G16B16A16_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 16, 16, 16, 16 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {16, 16, 16, 16}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32_SSCALED (0x95) { "R32G32_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32G32_USCALED (0x96) { "R32G32_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x97) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x98) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x99) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x9F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R32G32_SFIXED (0xA0) { "R32G32_SFIXED", - { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 32, 32, 0, 0 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {32, 32, 0, 0}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xA1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xA9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 
0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xAF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + 
{0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xB9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xBF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // B8G8R8A8_UNORM (0xC0) { "B8G8R8A8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? 
- { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B8G8R8A8_UNORM_SRGB (0xC1) { "B8G8R8A8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_UNORM (0xC2) { "R10G10B10A2_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_UNORM_SRGB (0xC3) { "R10G10B10A2_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? 
- { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_UINT (0xC4) { "R10G10B10A2_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xC5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xC6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8G8B8A8_UNORM (0xC7) { "R8G8B8A8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? 
- { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_UNORM_SRGB (0xC8) { "R8G8B8A8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_SNORM (0xC9) { "R8G8B8A8_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_SINT (0xCA) { "R8G8B8A8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_UINT (0xCB) { "R8G8B8A8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_UNORM (0xCC) { "R16G16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_SNORM (0xCD) { "R16G16_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, false, false }, // Is normalized? 
- { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_SINT (0xCE) { "R16G16_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_UINT (0xCF) { "R16G16_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_FLOAT (0xD0) { "R16G16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_UNORM (0xD1) { "B10G10R10A2_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_UNORM_SRGB (0xD2) { "B10G10R10A2_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R11G11B10_FLOAT (0xD3) { "R11G11B10_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 11, 11, 10, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {11, 11, 10, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xD4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R10G10B10_FLOAT_A2_UNORM (0xD5) { "R10G10B10_FLOAT_A2_UNORM", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f / 3.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f / 3.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_SINT (0xD6) { "R32_SINT", - { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_UINT (0xD7) { "R32_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_FLOAT (0xD8) { "R32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R24_UNORM_X8_TYPELESS (0xD9) { "R24_UNORM_X8_TYPELESS", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 24, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {24, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 16777215.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // X24_TYPELESS_G8_UINT (0xDA) { "X24_TYPELESS_G8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 1, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {1, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xDB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xDC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L32_UNORM (0xDD) { "L32_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 4294967295.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 4294967295.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xDE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L16A16_UNORM (0xDF) { "L16A16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I24X8_UNORM (0xE0) { "I24X8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 24, 8, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {24, 8, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L24X8_UNORM (0xE1) { "L24X8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 24, 8, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, true, false, false }, // Is normalized? 
- { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {24, 8, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xE2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // I32_FLOAT (0xE3) { "I32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L32_FLOAT (0xE4) { "L32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // A32_FLOAT (0xE5) { "A32_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xE6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xE7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xE8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // B8G8R8X8_UNORM (0xE9) { "B8G8R8X8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? 
- { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B8G8R8X8_UNORM_SRGB (0xEA) { "B8G8R8X8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8X8_UNORM (0xEB) { "R8G8B8X8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8X8_UNORM_SRGB (0xEC) { "R8G8B8X8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? 
- { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R9G9B9E5_SHAREDEXP (0xED) { "R9G9B9E5_SHAREDEXP", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 9, 9, 9, 5 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {9, 9, 9, 5}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10X2_UNORM (0xEE) { "B10G10R10X2_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? 
+ {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xEF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L16A16_FLOAT (0xF0) { "L16A16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0xF1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0xF2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R10G10B10X2_USCALED (0xF3) { "R10G10B10X2_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_SSCALED (0xF4) { "R8G8B8A8_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8A8_USCALED (0xF5) { "R8G8B8A8_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_SSCALED (0xF6) { "R16G16_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16_USCALED (0xF7) { "R16G16_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 16, 16, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {16, 16, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_SSCALED (0xF8) { "R32_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_USCALED (0xF9) { "R32_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 0, 0, 0 }, // To float scale factor
- 1, // bcWidth
- 1, // bcHeight
+ {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0x3f800000}, // Defaults for missing components
+ {0, 0, 0, 0}, // Swizzle
+ {32, 0, 0, 0}, // Bits per component
+ 32, // Bits per element
+ 4, // Bytes per element
+ 1, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ {false, false, false, false}, // Is normalized?
+ {1.0f, 0, 0, 0}, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
// padding (0xFA)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// padding (0xFB)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// padding (0xFC)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// padding (0xFD)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// padding (0xFE)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// padding (0xFF)
- {
- nullptr,
- { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
- 0, 0, 0, false, false, false, false,
- { false, false, false, false },
- { 0.0f, 0.0f, 0.0f, 0.0f },
- 1, 1
- },
+ {nullptr,
+ {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ 0,
+ 0,
+ 0,
+ false,
+ false,
+ false,
+ false,
+ {false, false, false, false},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ 1,
+ 1},
// B5G6R5_UNORM (0x100)
{
"B5G6R5_UNORM",
- { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
- { 2, 1, 0, 0 }, // Swizzle
- { 5, 6, 5, 0 }, // Bits per component
- 16, // Bits per element
- 2, // Bytes per element
- 3, // Num components
- false, // isSRGB
- false, // isBC
- false, // isSubsampled
- false, // isLuminance
- { true, true, true, false }, // Is normalized?
- { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
- 1, // bcWidth
- 1, // bcHeight
+ {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0x3f800000}, // Defaults for missing components
+ {2, 1, 0, 0}, // Swizzle
+ {5, 6, 5, 0}, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 3, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ {true, true, true, false}, // Is normalized?
+ {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
// B5G6R5_UNORM_SRGB (0x101)
{
"B5G6R5_UNORM_SRGB",
- { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
- { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
- { 2, 1, 0, 0 }, // Swizzle
- { 5, 6, 5, 0 }, // Bits per component
- 16, // Bits per element
- 2, // Bytes per element
- 3, // Num components
- true, // isSRGB
- false, // isBC
- false, // isSubsampled
- false, // isLuminance
- { true, true, true, false }, // Is normalized?
- { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
- 1, // bcWidth
- 1, // bcHeight
+ {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+ {0, 0, 0, 0x3f800000}, // Defaults for missing components
+ {2, 1, 0, 0}, // Swizzle
+ {5, 6, 5, 0}, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 3, // Num components
+ true, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ {true, true, true, false}, // Is normalized?
+ {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
+ 1, // bcWidth
+ 1, // bcHeight
},
// B5G5R5A1_UNORM (0x102)
{
"B5G5R5A1_UNORM",
- { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
- { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
- { 2, 1, 0, 3 }, // Swizzle
- { 5, 5, 5, 1 }, // Bits per component
- 16, // Bits per element
- 2, // Bytes per element
- 4, // Num components
- false, // isSRGB
- false, // isBC
- false, // isSubsampled
- false, // isLuminance
- { true, true, true, true }, // Is normalized?
- { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
- 1, // bcWidth
- 1, // bcHeight
+ {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+ {0, 0, 0, 0x3f800000}, // Defaults for missing components
+ {2, 1, 0, 3}, // Swizzle
+ {5, 5, 5, 1}, // Bits per component
+ 16, // Bits per element
+ 2, // Bytes per element
+ 4, // Num components
+ false, // isSRGB
+ false, // isBC
+ false, // isSubsampled
+ false, // isLuminance
+ {true, true, true, true}, // Is normalized?
+ {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B5G5R5A1_UNORM_SRGB (0x103) { "B5G5R5A1_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 5, 5, 5, 1 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {5, 5, 5, 1}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B4G4R4A4_UNORM (0x104) { "B4G4R4A4_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 4, 4, 4, 4 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {4, 4, 4, 4}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B4G4R4A4_UNORM_SRGB (0x105) { "B4G4R4A4_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 4, 4, 4, 4 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {4, 4, 4, 4}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? 
+ {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_UNORM (0x106) { "R8G8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_SNORM (0x107) { "R8G8_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 127.0f, 1.0f / 127.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_SINT (0x108) { "R8G8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_UINT (0x109) { "R8G8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_UNORM (0x10A) { "R16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_SNORM (0x10B) { "R16_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 32767.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_SINT (0x10C) { "R16_SINT", - { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_UINT (0x10D) { "R16_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_FLOAT (0x10E) { "R16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x10F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x110) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // I16_UNORM (0x111) { "I16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L16_UNORM (0x112) { "L16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // A16_UNORM (0x113) { "A16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8A8_UNORM (0x114) { "L8A8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, true, false, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I16_FLOAT (0x115) { "I16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L16_FLOAT (0x116) { "L16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // A16_FLOAT (0x117) { "A16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8A8_UNORM_SRGB (0x118) { "L8A8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, true, false, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, true, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x119) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // B5G5R5X1_UNORM (0x11A) { "B5G5R5X1_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 5, 5, 5, 1 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {5, 5, 5, 1}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B5G5R5X1_UNORM_SRGB (0x11B) { "B5G5R5X1_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 5, 5, 5, 1 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {5, 5, 5, 1}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_SSCALED (0x11C) { "R8G8_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8_USCALED (0x11D) { "R8G8_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_SSCALED (0x11E) { "R16_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16_USCALED (0x11F) { "R16_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 16, 0, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {16, 0, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x120) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x121) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x122) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x123) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // A1B5G5R5_UNORM (0x124) { "A1B5G5R5_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 2, 1, 0 }, // Swizzle - { 1, 5, 5, 5 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? 
- { 1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 2, 1, 0}, // Swizzle + {1, 5, 5, 5}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // A4B4G4R4_UNORM (0x125) { "A4B4G4R4_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 2, 1, 0 }, // Swizzle - { 4, 4, 4, 4 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 2, 1, 0}, // Swizzle + {4, 4, 4, 4}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8A8_UINT (0x126) { "L8A8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8A8_SINT (0x127) { "L8A8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 3, 0, 0 }, // Swizzle - { 8, 8, 0, 0 }, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 3, 0, 0}, // Swizzle + {8, 8, 0, 0}, // Bits per component + 16, // Bits per element + 2, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x128) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x129) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x12F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x130) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x131) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x132) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x133) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x134) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 
0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x135) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x136) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x137) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x138) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x139) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 
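
The long run of near-identical padding initializers here is not noise in the diff: the table is evidently indexed directly by the format code shown in the trailing comments (0x12E, 0x12F, ...), so every unused encoding must still occupy a slot to keep later entries at the right index. The clang-format change only reflows these initializers to one field per line. A minimal sketch of the lookup this layout implies, assuming the SWR_FORMAT enum and the gFormatInfo array these initializers populate:

    // Hedged sketch, assuming the SWR_FORMAT enum and the gFormatInfo
    // array declared with these initializers: the format code doubles as
    // the array index, which is why unused codes still need padding entries.
    const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
    {
        return gFormatInfo[format];
    }
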
0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x13F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8_UNORM (0x140) { "R8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? 
- { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8_SNORM (0x141) { "R8_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 127.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8_SINT (0x142) { "R8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8_UINT (0x143) { "R8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // A8_UNORM (0x144) { "A8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 3, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {3, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I8_UNORM (0x145) { "I8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8_UNORM (0x146) { "L8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x147) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x148) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8_SSCALED (0x149) { "R8_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8_USCALED (0x14A) { "R8_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
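
The R8/A8/I8/L8 entries above double as documentation for the initializer layout: per-component types, defaults for missing components stored as raw bit patterns (0x3f800000 is the IEEE-754 encoding of 1.0f, used by normalized and float formats, while the integer formats default missing alpha to 0x1), swizzle, bits per component and per element, the isSRGB/isBC/isSubsampled/isLuminance flags, a per-component normalization mask, and a to-float scale factor. A self-contained sketch of what that last column means for R8_UNORM (the helper name is illustrative, not from the source):

    #include <cstdint>
    // The "To float scale factor" column for R8_UNORM is 1.0f / 255.0f,
    // so texel 255 decodes to exactly 1.0f.
    float Unorm8ToFloat(uint8_t texel)
    {
        return static_cast<float>(texel) * (1.0f / 255.0f);
    }

The 1.0f / 127.0f entry on R8_SNORM plays the same role for signed texels, and the plain 1.0f on the SINT/UINT entries reflects that raw integer formats are read without normalization.
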
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x14B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L8_UNORM_SRGB (0x14C) { "L8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x14D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x14E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x14F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x150) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + 
{SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x151) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // L8_UINT (0x152) { "L8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // L8_SINT (0x153) { "L8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I8_UINT (0x154) { "I8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // I8_SINT (0x155) { "I8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + true, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x156) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x157) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x158) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x159) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 
{0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x15F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x160) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, 
false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x161) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x162) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x163) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x164) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x165) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x166) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x167) - { - nullptr, - { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x168) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x169) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16B) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, 
+ {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x16F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x170) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x171) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x172) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x173) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x174) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, 
false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x175) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x176) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x177) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x178) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x179) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17A) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17B) - { - nullptr, - { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17C) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17D) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x17F) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // DXT1_RGB_SRGB (0x180) { "DXT1_RGB_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? 
- { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // padding (0x181) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x182) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // YCRCB_SWAPUVY (0x183) { "YCRCB_SWAPUVY", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - true, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 2, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + true, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 2, // bcWidth + 1, // bcHeight }, // padding (0x184) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x185) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // BC1_UNORM (0x186) { "BC1_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC2_UNORM (0x187) { "BC2_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC3_UNORM (0x188) { "BC3_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC4_UNORM (0x189) { "BC4_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC5_UNORM (0x18A) { "BC5_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC1_UNORM_SRGB (0x18B) { "BC1_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC2_UNORM_SRGB (0x18C) { "BC2_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC3_UNORM_SRGB (0x18D) { "BC3_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
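
For the block-compressed entries, bcWidth and bcHeight switch meaning from pixels to the 4x4 compressed block, and "bytes per element" becomes the size of one block: 8 for DXT1/BC1/BC4 (64 bits per element) and 16 for BC2/BC3/BC5 (128 bits per element). A hedged sketch of the level-size arithmetic those numbers imply (the helper name is illustrative, not from the source):

    #include <cstdint>
    // Storage size of one BC1 mip level, using the table's 4x4 block
    // dimensions and 8 bytes per element (per block). Partial blocks at
    // the right/bottom edges round up to whole blocks.
    uint32_t Bc1LevelSizeBytes(uint32_t width, uint32_t height)
    {
        uint32_t blocksX = (width + 3) / 4;
        uint32_t blocksY = (height + 3) / 4;
        return blocksX * blocksY * 8;
    }

A 256x256 BC1 level is then 64 x 64 blocks x 8 bytes = 32 KiB.
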
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // padding (0x18E) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // YCRCB_SWAPUV (0x18F) { "YCRCB_SWAPUV", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - true, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 2, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + true, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 2, // bcWidth + 1, // bcHeight }, // padding (0x190) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // DXT1_RGB (0x191) { "DXT1_RGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // padding (0x192) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8G8B8_UNORM (0x193) { "R8G8B8_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8_SNORM (0x194) { "R8G8B8_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8_SSCALED (0x195) { "R8G8B8_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8_USCALED (0x196) { "R8G8B8_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R64G64B64A64_FLOAT (0x197) { "R64G64B64A64_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 64, 64, 64, 64 }, // Bits per component - 256, // Bits per element - 32, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {64, 64, 64, 64}, // Bits per component + 256, // Bits per element + 32, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R64G64B64_FLOAT (0x198) { "R64G64B64_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 64, 64, 64, 0 }, // Bits per component - 192, // Bits per element - 24, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {64, 64, 64, 0}, // Bits per component + 192, // Bits per element + 24, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // BC4_SNORM (0x199) { "BC4_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 127.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC5_SNORM (0x19A) { "BC5_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 127.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // R16G16B16_FLOAT (0x19B) { "R16G16B16_FLOAT", - { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16_UNORM (0x19C) { "R16G16B16_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16_SNORM (0x19D) { "R16G16B16_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? + {1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16_SSCALED (0x19E) { "R16G16B16_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16_USCALED (0x19F) { "R16G16B16_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x1A0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // BC6H_SF16 (0x1A1) { "BC6H_SF16", - { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 127.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC7_UNORM (0x1A2) { "BC7_UNORM", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC7_UNORM_SRGB (0x1A3) { "BC7_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? + {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // BC6H_UF16 (0x1A4) { "BC6H_UF16", - { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 8, 8, 8 }, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - { true, false, false, false }, // Is normalized? - { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor - 4, // bcWidth - 4, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 8, 8, 8}, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + false, // isLuminance + {true, false, false, false}, // Is normalized? 
+ {1.0f / 255.0f, 0, 0, 0}, // To float scale factor + 4, // bcWidth + 4, // bcHeight }, // padding (0x1A5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1A6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1A7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8G8B8_UNORM_SRGB (0x1A8) { "R8G8B8_UNORM_SRGB", - { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, false }, // Is normalized? - { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, false}, // Is normalized? 
+ {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x1A9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1AF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 
0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R16G16B16_UINT (0x1B0) { "R16G16B16_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R16G16B16_SINT (0x1B1) { "R16G16B16_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 16, 16, 16, 0 }, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {16, 16, 16, 0}, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R32_SFIXED (0x1B2) { "R32_SFIXED", - { SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 0, 0, 0 }, // Swizzle - { 32, 0, 0, 0 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 0, 0, 0}, // Swizzle + {32, 0, 0, 0}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_SNORM (0x1B3) { "R10G10B10A2_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_USCALED (0x1B4) { "R10G10B10A2_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_SSCALED (0x1B5) { "R10G10B10A2_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R10G10B10A2_SINT (0x1B6) { "R10G10B10A2_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_SNORM (0x1B7) { "B10G10R10A2_SNORM", - { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { true, true, true, true }, // Is normalized? - { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {true, true, true, true}, // Is normalized? + {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_USCALED (0x1B8) { "B10G10R10A2_USCALED", - { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_SSCALED (0x1B9) { "B10G10R10A2_SSCALED", - { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, - { 0, 0, 0, 0x3f800000 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, + {0, 0, 0, 0x3f800000}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_UINT (0x1BA) { "B10G10R10A2_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // B10G10R10A2_SINT (0x1BB) { "B10G10R10A2_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 2, 1, 0, 3 }, // Swizzle - { 10, 10, 10, 2 }, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, + {0, 0, 0, 0x1}, // Defaults for missing components + {2, 1, 0, 3}, // Swizzle + {10, 10, 10, 2}, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? 
+ {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x1BC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1BD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1BE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1BF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 
1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1C7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // R8G8B8_UINT (0x1C8) { "R8G8B8_UINT", - { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? 
- { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // R8G8B8_SINT (0x1C9) { "R8G8B8_SINT", - { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 0 }, // Swizzle - { 8, 8, 8, 0 }, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 0}, // Swizzle + {8, 8, 8, 0}, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 1.0f, 1.0f, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, // padding (0x1CA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1CB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1CC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1CD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, 
+ {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1CE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1CF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { 
false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1D9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DB) - { - 
nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1DF) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + 
{0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { 
false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1E9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1EA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1EB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1EC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1ED) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1EE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1EF) - { - 
nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F0) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F1) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F2) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F3) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F4) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F5) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + 
{0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F6) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F7) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F8) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1F9) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1FA) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1FB) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1FC) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { 
false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1FD) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // padding (0x1FE) - { - nullptr, - { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, - 0, 0, 0, false, false, false, false, - { false, false, false, false }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - 1, 1 - }, + {nullptr, + {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + 0, + 0, + 0, + false, + false, + false, + false, + {false, false, false, false}, + {0.0f, 0.0f, 0.0f, 0.0f}, + 1, + 1}, // RAW (0x1FF) { "RAW", - { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, - { 0, 0, 0, 0x1 }, // Defaults for missing components - { 0, 1, 2, 3 }, // Swizzle - { 8, 0, 0, 0 }, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - { false, false, false, false }, // Is normalized? - { 1.0f, 0, 0, 0 }, // To float scale factor - 1, // bcWidth - 1, // bcHeight + {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, + {0, 0, 0, 0x1}, // Defaults for missing components + {0, 1, 2, 3}, // Swizzle + {8, 0, 0, 0}, // Bits per component + 8, // Bits per element + 1, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + false, // isLuminance + {false, false, false, false}, // Is normalized? + {1.0f, 0, 0, 0}, // To float scale factor + 1, // bcWidth + 1, // bcHeight }, }; - diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h index f13f338f8b1..b7a3e533d15 100644 --- a/src/gallium/drivers/swr/rasterizer/common/formats.h +++ b/src/gallium/drivers/swr/rasterizer/common/formats.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
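The RAW (0x1FF) entry closing the table above is the one row with its columns labeled: per-component types, defaults for missing components, swizzle, bits per component and per element, the sRGB/BC/subsampled/luminance flags, normalization flags, float scale factors, and block-compression width/height. As a rough consistency check (the struct below is a hypothetical stand-in for the real SWR_FORMAT_INFO layout, which formats.h defines), a non-block-compressed entry should satisfy:

#include <cstdint>

struct FormatInfoSketch              // hypothetical mirror of SWR_FORMAT_INFO
{
    const char* name;
    uint32_t    bpc[4];              // Bits per component
    uint32_t    bpe;                 // Bits per element
    uint32_t    Bpe;                 // Bytes per element
    uint32_t    numComps;            // Num components
    bool        isBC;                // block-compressed?
};

// For a non-BC entry such as RAW ({8,0,0,0} bits, 8 bpe, 1 byte),
// component bits sum to the element bits, which is 8x the byte size.
inline bool SizesConsistent(const FormatInfoSketch& fi)
{
    uint32_t bits = fi.bpc[0] + fi.bpc[1] + fi.bpc[2] + fi.bpc[3];
    return fi.isBC || (bits == fi.bpe && fi.bpe == fi.Bpe * 8);
}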
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file formats.h -* -* @brief auto-generated file -* -* DO NOT EDIT -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file formats.h + * + * @brief auto-generated file + * + * DO NOT EDIT + * + ******************************************************************************/ #pragma once @@ -54,179 +54,179 @@ enum SWR_TYPE ////////////////////////////////////////////////////////////////////////// enum SWR_FORMAT { - R32G32B32A32_FLOAT = 0x0, - R32G32B32A32_SINT = 0x1, - R32G32B32A32_UINT = 0x2, - R64G64_FLOAT = 0x5, - R32G32B32X32_FLOAT = 0x6, - R32G32B32A32_SSCALED = 0x7, - R32G32B32A32_USCALED = 0x8, - R32G32B32A32_SFIXED = 0x20, - R32G32B32_FLOAT = 0x40, - R32G32B32_SINT = 0x41, - R32G32B32_UINT = 0x42, - R32G32B32_SSCALED = 0x45, - R32G32B32_USCALED = 0x46, - R32G32B32_SFIXED = 0x50, - R16G16B16A16_UNORM = 0x80, - R16G16B16A16_SNORM = 0x81, - R16G16B16A16_SINT = 0x82, - R16G16B16A16_UINT = 0x83, - R16G16B16A16_FLOAT = 0x84, - R32G32_FLOAT = 0x85, - R32G32_SINT = 0x86, - R32G32_UINT = 0x87, - R32_FLOAT_X8X24_TYPELESS = 0x88, - X32_TYPELESS_G8X24_UINT = 0x89, - L32A32_FLOAT = 0x8A, - R64_FLOAT = 0x8D, - R16G16B16X16_UNORM = 0x8E, - R16G16B16X16_FLOAT = 0x8F, - L32X32_FLOAT = 0x91, - I32X32_FLOAT = 0x92, - R16G16B16A16_SSCALED = 0x93, - R16G16B16A16_USCALED = 0x94, - R32G32_SSCALED = 0x95, - R32G32_USCALED = 0x96, - R32G32_SFIXED = 0xA0, - B8G8R8A8_UNORM = 0xC0, - B8G8R8A8_UNORM_SRGB = 0xC1, - R10G10B10A2_UNORM = 0xC2, - R10G10B10A2_UNORM_SRGB = 0xC3, - R10G10B10A2_UINT = 0xC4, - R8G8B8A8_UNORM = 0xC7, - R8G8B8A8_UNORM_SRGB = 0xC8, - R8G8B8A8_SNORM = 0xC9, - R8G8B8A8_SINT = 0xCA, - R8G8B8A8_UINT = 0xCB, - R16G16_UNORM = 0xCC, - R16G16_SNORM = 0xCD, - R16G16_SINT = 0xCE, - R16G16_UINT = 0xCF, - R16G16_FLOAT = 0xD0, - B10G10R10A2_UNORM = 0xD1, - B10G10R10A2_UNORM_SRGB = 0xD2, - R11G11B10_FLOAT = 0xD3, - R10G10B10_FLOAT_A2_UNORM = 0xD5, - R32_SINT = 0xD6, - R32_UINT = 0xD7, - R32_FLOAT = 0xD8, - R24_UNORM_X8_TYPELESS = 0xD9, - X24_TYPELESS_G8_UINT = 0xDA, - L32_UNORM = 0xDD, - L16A16_UNORM = 0xDF, - I24X8_UNORM = 0xE0, - L24X8_UNORM = 0xE1, - I32_FLOAT = 0xE3, - L32_FLOAT = 0xE4, - A32_FLOAT = 0xE5, - B8G8R8X8_UNORM = 0xE9, - B8G8R8X8_UNORM_SRGB = 0xEA, - R8G8B8X8_UNORM = 0xEB, - R8G8B8X8_UNORM_SRGB = 0xEC, - R9G9B9E5_SHAREDEXP = 0xED, - B10G10R10X2_UNORM = 0xEE, - L16A16_FLOAT = 0xF0, - R10G10B10X2_USCALED = 0xF3, - R8G8B8A8_SSCALED = 0xF4, - R8G8B8A8_USCALED = 0xF5, - R16G16_SSCALED = 0xF6, - R16G16_USCALED = 0xF7, - R32_SSCALED = 0xF8, - R32_USCALED = 0xF9, - B5G6R5_UNORM = 0x100, - B5G6R5_UNORM_SRGB = 0x101, - B5G5R5A1_UNORM = 0x102, - B5G5R5A1_UNORM_SRGB = 0x103, - B4G4R4A4_UNORM = 0x104, - B4G4R4A4_UNORM_SRGB = 0x105, - R8G8_UNORM = 0x106, - R8G8_SNORM = 0x107, - R8G8_SINT = 0x108, - R8G8_UINT = 0x109, - R16_UNORM = 0x10A, - R16_SNORM = 0x10B, - R16_SINT = 0x10C, - R16_UINT = 0x10D, - R16_FLOAT = 0x10E, - I16_UNORM = 0x111, - L16_UNORM = 0x112, - A16_UNORM = 0x113, - L8A8_UNORM = 0x114, - I16_FLOAT = 0x115, - L16_FLOAT = 0x116, - A16_FLOAT = 0x117, - L8A8_UNORM_SRGB = 0x118, - B5G5R5X1_UNORM = 0x11A, - B5G5R5X1_UNORM_SRGB = 0x11B, - R8G8_SSCALED = 0x11C, - R8G8_USCALED = 0x11D, - R16_SSCALED = 0x11E, - R16_USCALED = 0x11F, - A1B5G5R5_UNORM = 0x124, - A4B4G4R4_UNORM = 0x125, - L8A8_UINT = 0x126, - L8A8_SINT = 0x127, - R8_UNORM = 0x140, - R8_SNORM = 0x141, - R8_SINT = 0x142, - R8_UINT = 0x143, - A8_UNORM = 0x144, - I8_UNORM = 0x145, - L8_UNORM = 0x146, - R8_SSCALED = 0x149, - R8_USCALED = 0x14A, - L8_UNORM_SRGB = 0x14C, - L8_UINT = 0x152, - L8_SINT = 0x153, - I8_UINT = 0x154, - I8_SINT = 0x155, - DXT1_RGB_SRGB = 0x180, - YCRCB_SWAPUVY = 0x183, - BC1_UNORM = 0x186, - BC2_UNORM = 0x187, - BC3_UNORM 
= 0x188, - BC4_UNORM = 0x189, - BC5_UNORM = 0x18A, - BC1_UNORM_SRGB = 0x18B, - BC2_UNORM_SRGB = 0x18C, - BC3_UNORM_SRGB = 0x18D, - YCRCB_SWAPUV = 0x18F, - DXT1_RGB = 0x191, - R8G8B8_UNORM = 0x193, - R8G8B8_SNORM = 0x194, - R8G8B8_SSCALED = 0x195, - R8G8B8_USCALED = 0x196, - R64G64B64A64_FLOAT = 0x197, - R64G64B64_FLOAT = 0x198, - BC4_SNORM = 0x199, - BC5_SNORM = 0x19A, - R16G16B16_FLOAT = 0x19B, - R16G16B16_UNORM = 0x19C, - R16G16B16_SNORM = 0x19D, - R16G16B16_SSCALED = 0x19E, - R16G16B16_USCALED = 0x19F, - BC6H_SF16 = 0x1A1, - BC7_UNORM = 0x1A2, - BC7_UNORM_SRGB = 0x1A3, - BC6H_UF16 = 0x1A4, - R8G8B8_UNORM_SRGB = 0x1A8, - R16G16B16_UINT = 0x1B0, - R16G16B16_SINT = 0x1B1, - R32_SFIXED = 0x1B2, - R10G10B10A2_SNORM = 0x1B3, - R10G10B10A2_USCALED = 0x1B4, - R10G10B10A2_SSCALED = 0x1B5, - R10G10B10A2_SINT = 0x1B6, - B10G10R10A2_SNORM = 0x1B7, - B10G10R10A2_USCALED = 0x1B8, - B10G10R10A2_SSCALED = 0x1B9, - B10G10R10A2_UINT = 0x1BA, - B10G10R10A2_SINT = 0x1BB, - R8G8B8_UINT = 0x1C8, - R8G8B8_SINT = 0x1C9, - RAW = 0x1FF, - NUM_SWR_FORMATS = 0x200, + R32G32B32A32_FLOAT = 0x0, + R32G32B32A32_SINT = 0x1, + R32G32B32A32_UINT = 0x2, + R64G64_FLOAT = 0x5, + R32G32B32X32_FLOAT = 0x6, + R32G32B32A32_SSCALED = 0x7, + R32G32B32A32_USCALED = 0x8, + R32G32B32A32_SFIXED = 0x20, + R32G32B32_FLOAT = 0x40, + R32G32B32_SINT = 0x41, + R32G32B32_UINT = 0x42, + R32G32B32_SSCALED = 0x45, + R32G32B32_USCALED = 0x46, + R32G32B32_SFIXED = 0x50, + R16G16B16A16_UNORM = 0x80, + R16G16B16A16_SNORM = 0x81, + R16G16B16A16_SINT = 0x82, + R16G16B16A16_UINT = 0x83, + R16G16B16A16_FLOAT = 0x84, + R32G32_FLOAT = 0x85, + R32G32_SINT = 0x86, + R32G32_UINT = 0x87, + R32_FLOAT_X8X24_TYPELESS = 0x88, + X32_TYPELESS_G8X24_UINT = 0x89, + L32A32_FLOAT = 0x8A, + R64_FLOAT = 0x8D, + R16G16B16X16_UNORM = 0x8E, + R16G16B16X16_FLOAT = 0x8F, + L32X32_FLOAT = 0x91, + I32X32_FLOAT = 0x92, + R16G16B16A16_SSCALED = 0x93, + R16G16B16A16_USCALED = 0x94, + R32G32_SSCALED = 0x95, + R32G32_USCALED = 0x96, + R32G32_SFIXED = 0xA0, + B8G8R8A8_UNORM = 0xC0, + B8G8R8A8_UNORM_SRGB = 0xC1, + R10G10B10A2_UNORM = 0xC2, + R10G10B10A2_UNORM_SRGB = 0xC3, + R10G10B10A2_UINT = 0xC4, + R8G8B8A8_UNORM = 0xC7, + R8G8B8A8_UNORM_SRGB = 0xC8, + R8G8B8A8_SNORM = 0xC9, + R8G8B8A8_SINT = 0xCA, + R8G8B8A8_UINT = 0xCB, + R16G16_UNORM = 0xCC, + R16G16_SNORM = 0xCD, + R16G16_SINT = 0xCE, + R16G16_UINT = 0xCF, + R16G16_FLOAT = 0xD0, + B10G10R10A2_UNORM = 0xD1, + B10G10R10A2_UNORM_SRGB = 0xD2, + R11G11B10_FLOAT = 0xD3, + R10G10B10_FLOAT_A2_UNORM = 0xD5, + R32_SINT = 0xD6, + R32_UINT = 0xD7, + R32_FLOAT = 0xD8, + R24_UNORM_X8_TYPELESS = 0xD9, + X24_TYPELESS_G8_UINT = 0xDA, + L32_UNORM = 0xDD, + L16A16_UNORM = 0xDF, + I24X8_UNORM = 0xE0, + L24X8_UNORM = 0xE1, + I32_FLOAT = 0xE3, + L32_FLOAT = 0xE4, + A32_FLOAT = 0xE5, + B8G8R8X8_UNORM = 0xE9, + B8G8R8X8_UNORM_SRGB = 0xEA, + R8G8B8X8_UNORM = 0xEB, + R8G8B8X8_UNORM_SRGB = 0xEC, + R9G9B9E5_SHAREDEXP = 0xED, + B10G10R10X2_UNORM = 0xEE, + L16A16_FLOAT = 0xF0, + R10G10B10X2_USCALED = 0xF3, + R8G8B8A8_SSCALED = 0xF4, + R8G8B8A8_USCALED = 0xF5, + R16G16_SSCALED = 0xF6, + R16G16_USCALED = 0xF7, + R32_SSCALED = 0xF8, + R32_USCALED = 0xF9, + B5G6R5_UNORM = 0x100, + B5G6R5_UNORM_SRGB = 0x101, + B5G5R5A1_UNORM = 0x102, + B5G5R5A1_UNORM_SRGB = 0x103, + B4G4R4A4_UNORM = 0x104, + B4G4R4A4_UNORM_SRGB = 0x105, + R8G8_UNORM = 0x106, + R8G8_SNORM = 0x107, + R8G8_SINT = 0x108, + R8G8_UINT = 0x109, + R16_UNORM = 0x10A, + R16_SNORM = 0x10B, + R16_SINT = 0x10C, + R16_UINT = 0x10D, + R16_FLOAT = 0x10E, + I16_UNORM = 0x111, + L16_UNORM = 0x112, + A16_UNORM = 
0x113, + L8A8_UNORM = 0x114, + I16_FLOAT = 0x115, + L16_FLOAT = 0x116, + A16_FLOAT = 0x117, + L8A8_UNORM_SRGB = 0x118, + B5G5R5X1_UNORM = 0x11A, + B5G5R5X1_UNORM_SRGB = 0x11B, + R8G8_SSCALED = 0x11C, + R8G8_USCALED = 0x11D, + R16_SSCALED = 0x11E, + R16_USCALED = 0x11F, + A1B5G5R5_UNORM = 0x124, + A4B4G4R4_UNORM = 0x125, + L8A8_UINT = 0x126, + L8A8_SINT = 0x127, + R8_UNORM = 0x140, + R8_SNORM = 0x141, + R8_SINT = 0x142, + R8_UINT = 0x143, + A8_UNORM = 0x144, + I8_UNORM = 0x145, + L8_UNORM = 0x146, + R8_SSCALED = 0x149, + R8_USCALED = 0x14A, + L8_UNORM_SRGB = 0x14C, + L8_UINT = 0x152, + L8_SINT = 0x153, + I8_UINT = 0x154, + I8_SINT = 0x155, + DXT1_RGB_SRGB = 0x180, + YCRCB_SWAPUVY = 0x183, + BC1_UNORM = 0x186, + BC2_UNORM = 0x187, + BC3_UNORM = 0x188, + BC4_UNORM = 0x189, + BC5_UNORM = 0x18A, + BC1_UNORM_SRGB = 0x18B, + BC2_UNORM_SRGB = 0x18C, + BC3_UNORM_SRGB = 0x18D, + YCRCB_SWAPUV = 0x18F, + DXT1_RGB = 0x191, + R8G8B8_UNORM = 0x193, + R8G8B8_SNORM = 0x194, + R8G8B8_SSCALED = 0x195, + R8G8B8_USCALED = 0x196, + R64G64B64A64_FLOAT = 0x197, + R64G64B64_FLOAT = 0x198, + BC4_SNORM = 0x199, + BC5_SNORM = 0x19A, + R16G16B16_FLOAT = 0x19B, + R16G16B16_UNORM = 0x19C, + R16G16B16_SNORM = 0x19D, + R16G16B16_SSCALED = 0x19E, + R16G16B16_USCALED = 0x19F, + BC6H_SF16 = 0x1A1, + BC7_UNORM = 0x1A2, + BC7_UNORM_SRGB = 0x1A3, + BC6H_UF16 = 0x1A4, + R8G8B8_UNORM_SRGB = 0x1A8, + R16G16B16_UINT = 0x1B0, + R16G16B16_SINT = 0x1B1, + R32_SFIXED = 0x1B2, + R10G10B10A2_SNORM = 0x1B3, + R10G10B10A2_USCALED = 0x1B4, + R10G10B10A2_SSCALED = 0x1B5, + R10G10B10A2_SINT = 0x1B6, + B10G10R10A2_SNORM = 0x1B7, + B10G10R10A2_USCALED = 0x1B8, + B10G10R10A2_SSCALED = 0x1B9, + B10G10R10A2_UINT = 0x1BA, + B10G10R10A2_SINT = 0x1BB, + R8G8B8_UINT = 0x1C8, + R8G8B8_SINT = 0x1C9, + RAW = 0x1FF, + NUM_SWR_FORMATS = 0x200, }; ////////////////////////////////////////////////////////////////////////// @@ -266,4 +266,3 @@ INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) // lookup table for unorm8 srgb -> float conversion extern const uint32_t srgb8Table[256]; - diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h index 59d66bc60a8..4c413caf441 100644 --- a/src/gallium/drivers/swr/rasterizer/common/intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
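A note on the enum hunk above: SWR_FORMAT codes are sparse (0x3, 0x4, and many other values are unassigned), and NUM_SWR_FORMATS = 0x200 sizes a table indexed directly by the enum value — which is why the formats.cpp table earlier carries explicit zeroed padding entries for every unused code. GetFormatInfo can then be a bounds-checked array load, roughly as follows (a sketch; the table symbol name here is an assumption, not taken from this diff):

#include <cassert>

// Assumed: the padded table from formats.cpp, one entry per enum slot.
extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];

INLINE const SWR_FORMAT_INFO& GetFormatInfoSketch(SWR_FORMAT format)
{
    assert(format < NUM_SWR_FORMATS);
    return gFormatInfo[format]; // the enum value doubles as the array index
}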
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #ifndef __SWR_INTRIN_H__ #define __SWR_INTRIN_H__ @@ -28,34 +28,34 @@ #if !defined(SIMD_ARCH) #define SIMD_ARCH KNOB_ARCH -#endif +#endif #include "simdlib_types.hpp" -typedef SIMDImpl::SIMD128Impl::Float simd4scalar; -typedef SIMDImpl::SIMD128Impl::Double simd4scalard; -typedef SIMDImpl::SIMD128Impl::Integer simd4scalari; -typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector; -typedef SIMDImpl::SIMD128Impl::Mask simd4mask; - -typedef SIMDImpl::SIMD256Impl::Float simd8scalar; -typedef SIMDImpl::SIMD256Impl::Double simd8scalard; -typedef SIMDImpl::SIMD256Impl::Integer simd8scalari; -typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector; -typedef SIMDImpl::SIMD256Impl::Mask simd8mask; - -typedef SIMDImpl::SIMD512Impl::Float simd16scalar; -typedef SIMDImpl::SIMD512Impl::Double simd16scalard; -typedef SIMDImpl::SIMD512Impl::Integer simd16scalari; -typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector; -typedef SIMDImpl::SIMD512Impl::Mask simd16mask; - -#if KNOB_SIMD_WIDTH == 8 -typedef simd8scalar simdscalar; -typedef simd8scalard simdscalard; -typedef simd8scalari simdscalari; -typedef simd8vector simdvector; -typedef simd8mask simdmask; +typedef SIMDImpl::SIMD128Impl::Float simd4scalar; +typedef SIMDImpl::SIMD128Impl::Double simd4scalard; +typedef SIMDImpl::SIMD128Impl::Integer simd4scalari; +typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector; +typedef SIMDImpl::SIMD128Impl::Mask simd4mask; + +typedef SIMDImpl::SIMD256Impl::Float simd8scalar; +typedef SIMDImpl::SIMD256Impl::Double simd8scalard; +typedef SIMDImpl::SIMD256Impl::Integer simd8scalari; +typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector; +typedef SIMDImpl::SIMD256Impl::Mask simd8mask; + +typedef SIMDImpl::SIMD512Impl::Float simd16scalar; +typedef SIMDImpl::SIMD512Impl::Double simd16scalard; +typedef SIMDImpl::SIMD512Impl::Integer simd16scalari; +typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector; +typedef SIMDImpl::SIMD512Impl::Mask simd16mask; + +#if 
KNOB_SIMD_WIDTH == 8 +typedef simd8scalar simdscalar; +typedef simd8scalard simdscalard; +typedef simd8scalari simdscalari; +typedef simd8vector simdvector; +typedef simd8mask simdmask; #else #error Unsupported vector width #endif @@ -68,7 +68,7 @@ UINT pdep_u32(UINT a, UINT mask) #else UINT result = 0; - // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html // using bsf instead of funky loop DWORD maskIndex; while (_BitScanForward(&maskIndex, mask)) @@ -99,8 +99,8 @@ UINT pext_u32(UINT a, UINT mask) #if KNOB_ARCH >= KNOB_ARCH_AVX2 return _pext_u32(a, mask); #else - UINT result = 0; - DWORD maskIndex; + UINT result = 0; + DWORD maskIndex; uint32_t currentBit = 0; while (_BitScanForward(&maskIndex, mask)) { @@ -117,4 +117,4 @@ UINT pext_u32(UINT a, UINT mask) #endif } -#endif//__SWR_INTRIN_H__ +#endif //__SWR_INTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp index a62350f2b60..aea5740bb66 100644 --- a/src/gallium/drivers/swr/rasterizer/common/isa.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
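The pdep_u32/pext_u32 hunks above keep the pre-AVX2 fallback, which scans the mask with _BitScanForward rather than looping over all 32 bit positions (per the cited pdep-soft-emu article). The same bit-deposit idea written portably, for illustration only:

#include <cstdint>

inline uint32_t pdep_soft(uint32_t a, uint32_t mask)
{
    uint32_t result = 0;
    uint32_t srcBit = 0;
    while (mask)
    {
        uint32_t lowest = mask & (0u - mask); // isolate lowest set mask bit
        if (a & (1u << srcBit))
        {
            result |= lowest;                 // deposit the next source bit there
        }
        mask &= mask - 1;                     // clear that mask bit
        ++srcBit;
    }
    return result;
}
// e.g. pdep_soft(0b101, 0b11010) == 0b10010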
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #pragma once @@ -44,7 +44,7 @@ class InstructionSet { public: - InstructionSet() : CPU_Rep() {}; + InstructionSet() : CPU_Rep(){}; // getters std::string Vendor(void) { return CPU_Rep.vendor_; } @@ -113,21 +113,11 @@ private: class InstructionSet_Internal { public: - InstructionSet_Internal() - : nIds_{ 0 }, - nExIds_{ 0 }, - isIntel_{ false }, - isAMD_{ false }, - f_1_ECX_{ 0 }, - f_1_EDX_{ 0 }, - f_7_EBX_{ 0 }, - f_7_ECX_{ 0 }, - f_81_ECX_{ 0 }, - f_81_EDX_{ 0 }, - data_{}, - extdata_{} + InstructionSet_Internal() : + nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0}, + f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{} { - //int cpuInfo[4] = {-1}; + // int cpuInfo[4] = {-1}; std::array<int, 4> cpui; // Calling __cpuid with 0x0 as the function_id argument @@ -144,7 +134,7 @@ private: #if defined(_MSC_VER) && !defined(__clang__) __cpuidex(cpui.data(), i, 0); #else - int *data = cpui.data(); + int* data = cpui.data(); __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); #endif data_.push_back(cpui); @@ -153,10 +143,10 @@ private: // Capture vendor string char vendor[0x20]; memset(vendor, 0, sizeof(vendor)); - *reinterpret_cast<int*>(vendor) = data_[0][1]; + *reinterpret_cast<int*>(vendor) = data_[0][1]; *reinterpret_cast<int*>(vendor + 4) = data_[0][3]; *reinterpret_cast<int*>(vendor + 8) = data_[0][2]; - vendor_ = vendor; + vendor_ = vendor; if (vendor_ == "GenuineIntel") { isIntel_ = true; @@ -197,7 +187,7 @@ private: #if defined(_MSC_VER) && !defined(__clang__) __cpuidex(cpui.data(), i, 0); #else - int *data = cpui.data(); + int* data = cpui.data(); __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); #endif extdata_.push_back(cpui); @@ -220,18 +210,18 @@ private: } }; - int nIds_; - unsigned nExIds_; - std::string vendor_; - std::string brand_; - bool isIntel_; - bool isAMD_; - std::bitset<32> f_1_ECX_; - std::bitset<32> f_1_EDX_; - std::bitset<32> f_7_EBX_; - std::bitset<32> f_7_ECX_; - std::bitset<32> f_81_ECX_; - std::bitset<32> f_81_EDX_; + int nIds_; + unsigned nExIds_; + std::string vendor_; + std::string brand_; + bool isIntel_; + bool isAMD_; + std::bitset<32> f_1_ECX_; + std::bitset<32> f_1_EDX_; + std::bitset<32> f_7_EBX_; + std::bitset<32> f_7_ECX_; + std::bitset<32> f_81_ECX_; + std::bitset<32> f_81_EDX_; std::vector<std::array<int, 4>> data_; std::vector<std::array<int, 4>> extdata_; }; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.cpp b/src/gallium/drivers/swr/rasterizer/common/os.cpp index 2d97270b997..aa817d451b4 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/os.cpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
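The isa.hpp hunks above are pure re-indentation around the CPUID probing. For reference, the vendor-string capture performed there (EBX, EDX, ECX of leaf 0, concatenated in that order) looks like this in isolation — a sketch using the same GCC __cpuid_count path, x86 targets only:

#include <cpuid.h>   // GCC/Clang x86 intrinsic header
#include <cstring>
#include <string>

inline std::string CpuVendorSketch()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    __cpuid_count(0, 0, eax, ebx, ecx, edx);
    char vendor[13] = {};             // 12 chars + NUL terminator
    std::memcpy(vendor + 0, &ebx, 4);
    std::memcpy(vendor + 4, &edx, 4);
    std::memcpy(vendor + 8, &ecx, 4);
    return vendor;                    // "GenuineIntel" or "AuthenticAMD"
}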
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #include "common/os.h" #include <vector> @@ -34,28 +34,26 @@ #include <pthread.h> #endif // Linux - - #if defined(_WIN32) static const DWORD MS_VC_EXCEPTION = 0x406D1388; -#pragma pack(push,8) +#pragma pack(push, 8) typedef struct tagTHREADNAME_INFO { - DWORD dwType; // Must be 0x1000. - LPCSTR szName; // Pointer to name (in user addr space). - DWORD dwThreadID; // Thread ID (-1=caller thread). - DWORD dwFlags; // Reserved for future use, must be zero. + DWORD dwType; // Must be 0x1000. + LPCSTR szName; // Pointer to name (in user addr space). + DWORD dwThreadID; // Thread ID (-1=caller thread). + DWORD dwFlags; // Reserved for future use, must be zero. 
} THREADNAME_INFO; #pragma pack(pop) void LegacySetThreadName(const char* pThreadName) { THREADNAME_INFO info; - info.dwType = 0x1000; - info.szName = pThreadName; + info.dwType = 0x1000; + info.szName = pThreadName; info.dwThreadID = GetCurrentThreadId(); - info.dwFlags = 0; + info.dwFlags = 0; if (!IsDebuggerPresent()) { @@ -63,14 +61,16 @@ void LegacySetThreadName(const char* pThreadName) return; } -#pragma warning(push) -#pragma warning(disable: 6320 6322) - __try { +#pragma warning(push) +#pragma warning(disable : 6320 6322) + __try + { RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info); } - __except (EXCEPTION_EXECUTE_HANDLER) { + __except (EXCEPTION_EXECUTE_HANDLER) + { } -#pragma warning(pop) +#pragma warning(pop) } #endif // _WIN32 @@ -78,23 +78,21 @@ void SWR_API SetCurrentThreadName(const char* pThreadName) { #if defined(_WIN32) // The SetThreadDescription API was brought in version 1607 of Windows 10. - typedef HRESULT(WINAPI* PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription); + typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription); // The SetThreadDescription API works even if no debugger is attached. - auto pfnSetThreadDescription = - reinterpret_cast<PFNSetThreadDescription>( - GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription")); + auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>( + GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription")); if (!pfnSetThreadDescription) { // try KernelBase.dll - pfnSetThreadDescription = - reinterpret_cast<PFNSetThreadDescription>( - GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription")); + pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>( + GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription")); } if (pfnSetThreadDescription) { - std::string utf8Name = pThreadName; + std::string utf8Name = pThreadName; std::wstring wideName; wideName.resize(utf8Name.size() + 1); swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str()); @@ -113,12 +111,13 @@ void SWR_API SetCurrentThreadName(const char* pThreadName) #endif // Linux } -static void SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken) +static void +SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken) { out_segments.clear(); std::istringstream f(input); - std::string s; + std::string s; while (std::getline(f, s, splitToken)) { if (s.size()) @@ -155,12 +154,11 @@ void SWR_API CreateDirectoryPath(const std::string& path) /// Execute Command (block until finished) /// @returns process exit value -int SWR_API ExecCmd( - const std::string& cmd, ///< (In) Command line string - const char* pOptEnvStrings, ///< (Optional In) Environment block for new process - std::string* pOptStdOut, ///< (Optional Out) Standard Output text - std::string* pOptStdErr, ///< (Optional Out) Standard Error text - const std::string* pOptStdIn) ///< (Optional In) Standard Input text +int SWR_API ExecCmd(const std::string& cmd, ///< (In) Command line string + const char* pOptEnvStrings, ///< (Optional In) Environment block for new process + std::string* pOptStdOut, ///< (Optional Out) Standard Output text + std::string* pOptStdErr, ///< (Optional Out) Standard Error text + const std::string* pOptStdIn) ///< (Optional In) Standard Input text { int rvalue = -1; @@ -172,8 +170,8 @@ int SWR_API ExecCmd( }; 
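Both thread-naming paths reformatted above survive intact: the legacy MS_VC_EXCEPTION trick (only useful with a debugger attached) and the Windows 10 1607+ SetThreadDescription API resolved at run time. The run-time resolution pattern on its own, as a Windows-only sketch:

#if defined(_WIN32)
#include <windows.h>

typedef HRESULT(WINAPI* PFNSetThreadDescription)(HANDLE, PCWSTR);

inline bool TrySetThreadDescription(PCWSTR name)
{
    // Resolve dynamically so binaries still load on pre-1607 Windows.
    auto pfn = reinterpret_cast<PFNSetThreadDescription>(
        GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
    if (!pfn)
    {
        return false; // API absent: fall back to the exception trick
    }
    return SUCCEEDED(pfn(GetCurrentThread(), name));
}
#endif // _WIN32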
std::array<WinPipe, 3> hPipes = {}; - SECURITY_ATTRIBUTES saAttr = { sizeof(SECURITY_ATTRIBUTES) }; - saAttr.bInheritHandle = TRUE; //Pipe handles are inherited by child process. + SECURITY_ATTRIBUTES saAttr = {sizeof(SECURITY_ATTRIBUTES)}; + saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by child process. saAttr.lpSecurityDescriptor = NULL; { @@ -198,7 +196,7 @@ int SWR_API ExecCmd( } STARTUPINFOA StartupInfo{}; - StartupInfo.cb = sizeof(STARTUPINFOA); + StartupInfo.cb = sizeof(STARTUPINFOA); StartupInfo.dwFlags = STARTF_USESTDHANDLES; StartupInfo.dwFlags |= STARTF_USESHOWWINDOW; StartupInfo.wShowWindow = SW_HIDE; @@ -207,30 +205,28 @@ int SWR_API ExecCmd( StartupInfo.hStdInput = hPipes[0].hRead; } StartupInfo.hStdOutput = hPipes[1].hWrite; - StartupInfo.hStdError = hPipes[2].hWrite; + StartupInfo.hStdError = hPipes[2].hWrite; PROCESS_INFORMATION procInfo{}; // CreateProcess can modify the string std::string local_cmd = cmd; - BOOL ProcessValue = CreateProcessA( - NULL, - (LPSTR)local_cmd.c_str(), - NULL, - NULL, - TRUE, - 0, - (LPVOID)pOptEnvStrings, - NULL, - &StartupInfo, - &procInfo); + BOOL ProcessValue = CreateProcessA(NULL, + (LPSTR)local_cmd.c_str(), + NULL, + NULL, + TRUE, + 0, + (LPVOID)pOptEnvStrings, + NULL, + &StartupInfo, + &procInfo); if (ProcessValue && procInfo.hProcess) { - auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) - { - char buf[1024]; - DWORD dwRead = 0; + auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) { + char buf[1024]; + DWORD dwRead = 0; DWORD dwAvail = 0; while (true) { @@ -244,7 +240,12 @@ int SWR_API ExecCmd( break; } - if (!::ReadFile(hPipe, buf, std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)), &dwRead, NULL) || !dwRead) + if (!::ReadFile(hPipe, + buf, + std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)), + &dwRead, + NULL) || + !dwRead) { // error, the child process might ended break; @@ -257,17 +258,18 @@ int SWR_API ExecCmd( } } }; - bool bProcessEnded = false; - size_t bytesWritten = 0; + bool bProcessEnded = false; + size_t bytesWritten = 0; do { if (pOptStdIn && (pOptStdIn->size() > bytesWritten)) { DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten; - if (!::WriteFile( - hPipes[0].hWrite, - pOptStdIn->data() + bytesWritten, - bytesToWrite, &bytesToWrite, nullptr)) + if (!::WriteFile(hPipes[0].hWrite, + pOptStdIn->data() + bytesWritten, + bytesToWrite, + &bytesToWrite, + nullptr)) { // Failed to write to pipe break; @@ -280,8 +282,7 @@ int SWR_API ExecCmd( ReadFromPipe(hPipes[1].hRead, pOptStdOut); ReadFromPipe(hPipes[2].hRead, pOptStdErr); - } - while (!bProcessEnded); + } while (!bProcessEnded); DWORD exitVal = 0; if (!GetExitCodeProcess(procInfo.hProcess, &exitVal)) diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index e779562225e..d33c8735d11 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved. 
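The Windows ExecCmd path above manages three raw pipes so it can feed stdin and drain stdout/stderr without deadlocking. Where only stdout capture is needed, a POSIX counterpart can lean on popen instead — a much smaller sketch (it offers no stderr/stdin control, which is precisely what the raw-pipe version buys):

#include <cstdio>
#include <string>

inline int ExecCmdSketch(const std::string& cmd, std::string* pOptStdOut)
{
    FILE* pipe = popen(cmd.c_str(), "r"); // POSIX; Windows spells it _popen
    if (!pipe)
    {
        return -1;
    }
    char buf[1024];
    while (fgets(buf, sizeof(buf), pipe))
    {
        if (pOptStdOut)
        {
            *pOptStdOut += buf; // accumulate child stdout as it arrives
        }
    }
    return pclose(pipe); // raw wait status; decode with WEXITSTATUS()
}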
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #ifndef __SWR_OS_H__ #define __SWR_OS_H__ @@ -30,7 +30,7 @@ #if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) #define SWR_API __cdecl -#define SWR_VISIBLE __declspec(dllexport) +#define SWR_VISIBLE __declspec(dllexport) #ifndef NOMINMAX #define NOMINMAX @@ -64,12 +64,12 @@ #define DEBUGBREAK __debugbreak() #define PRAGMA_WARNING_PUSH_DISABLE(...) 
\ - __pragma(warning(push));\ - __pragma(warning(disable:__VA_ARGS__)); + __pragma(warning(push)); \ + __pragma(warning(disable : __VA_ARGS__)); #define PRAGMA_WARNING_POP() __pragma(warning(pop)) -static inline void *AlignedMalloc(size_t _Size, size_t _Alignment) +static inline void* AlignedMalloc(size_t _Size, size_t _Alignment) { return _aligned_malloc(_Size, _Alignment); } @@ -104,13 +104,13 @@ static inline void AlignedFree(void* p) #include <stdio.h> #include <limits.h> -typedef void VOID; -typedef void* LPVOID; -typedef int INT; -typedef unsigned int UINT; -typedef void* HANDLE; -typedef int LONG; -typedef unsigned int DWORD; +typedef void VOID; +typedef void* LPVOID; +typedef int INT; +typedef unsigned int UINT; +typedef void* HANDLE; +typedef int LONG; +typedef unsigned int DWORD; #undef FALSE #define FALSE 0 @@ -124,7 +124,7 @@ typedef unsigned int DWORD; #ifndef INLINE #define INLINE __inline #endif -#define DEBUGBREAK asm ("int $3") +#define DEBUGBREAK asm("int $3") #if !defined(__CYGWIN__) @@ -136,28 +136,25 @@ typedef unsigned int DWORD; #endif #if defined(__GNUC__) && !defined(__INTEL_COMPILER) - #define __declspec(x) __declspec_##x - #define __declspec_align(y) __attribute__((aligned(y))) - #define __declspec_deprecated __attribute__((deprecated)) - #define __declspec_dllexport - #define __declspec_dllimport - #define __declspec_noinline __attribute__((__noinline__)) - #define __declspec_nothrow __attribute__((nothrow)) - #define __declspec_novtable - #define __declspec_thread __thread +#define __declspec(x) __declspec_##x +#define __declspec_align(y) __attribute__((aligned(y))) +#define __declspec_deprecated __attribute__((deprecated)) +#define __declspec_dllexport +#define __declspec_dllimport +#define __declspec_noinline __attribute__((__noinline__)) +#define __declspec_nothrow __attribute__((nothrow)) +#define __declspec_novtable +#define __declspec_thread __thread #else - #define __declspec(X) +#define __declspec(X) #endif #endif -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) -inline -uint64_t __rdtsc() +inline uint64_t __rdtsc() { long low, high; asm volatile("rdtsc" : "=a"(low), "=d"(high)); @@ -165,10 +162,9 @@ uint64_t __rdtsc() } #endif -#if !defined( __clang__) && !defined(__INTEL_COMPILER) +#if !defined(__clang__) && !defined(__INTEL_COMPILER) // Intrinsic not defined in gcc -static INLINE -void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) +static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a) { _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a)); _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); @@ -181,29 +177,25 @@ void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) #endif #endif -inline -unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask) +inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask) { *Index = __builtin_ctz(Mask); return (Mask != 0); } -inline -unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask) +inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask) { *Index = __builtin_ctz(Mask); return (Mask != 0); } -inline -unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask) +inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask) { *Index = __builtin_clz(Mask); return (Mask 
!= 0); } -inline -unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) +inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask) { *Index = __builtin_clz(Mask); return (Mask != 0); @@ -212,10 +204,9 @@ unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) #define _BitScanForward64 _BitScanForward #define _BitScanReverse64 _BitScanReverse -inline -void *AlignedMalloc(size_t size, size_t alignment) +inline void* AlignedMalloc(size_t size, size_t alignment) { - void *ret; + void* ret; if (posix_memalign(&ret, alignment, size)) { return NULL; @@ -223,19 +214,19 @@ void *AlignedMalloc(size_t size, size_t alignment) return ret; } -static inline -void AlignedFree(void* p) +static inline void AlignedFree(void* p) { free(p); } -#define _countof(a) (sizeof(a)/sizeof(*(a))) +#define _countof(a) (sizeof(a) / sizeof(*(a))) #define sprintf_s sprintf -#define strcpy_s(dst,size,src) strncpy(dst,src,size) +#define strcpy_s(dst, size, src) strncpy(dst, src, size) #define GetCurrentProcessId getpid -#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) +#define InterlockedCompareExchange(Dest, Exchange, Comparand) \ + __sync_val_compare_and_swap(Dest, Comparand, Exchange) #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) #define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) @@ -257,9 +248,9 @@ void AlignedFree(void* p) #define THREAD thread_local // Universal types -typedef uint8_t KILOBYTE[1024]; -typedef KILOBYTE MEGABYTE[1024]; -typedef MEGABYTE GIGABYTE[1024]; +typedef uint8_t KILOBYTE[1024]; +typedef KILOBYTE MEGABYTE[1024]; +typedef MEGABYTE GIGABYTE[1024]; #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) #define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) @@ -275,9 +266,9 @@ typedef MEGABYTE GIGABYTE[1024]; #define ATTR_UNUSED #endif -#define SWR_FUNC(_retType, _funcName, /* args */...) \ - typedef _retType (SWR_API * PFN##_funcName)(__VA_ARGS__); \ - _retType SWR_API _funcName(__VA_ARGS__); +#define SWR_FUNC(_retType, _funcName, /* args */...) 
\ + typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \ + _retType SWR_API _funcName(__VA_ARGS__); // Defined in os.cpp void SWR_API SetCurrentThreadName(const char* pThreadName); @@ -285,11 +276,11 @@ void SWR_API CreateDirectoryPath(const std::string& path); /// Execute Command (block until finished) /// @returns process exit value -int SWR_API ExecCmd( - const std::string& cmd, ///< (In) Command line string - const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process - std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text - std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text - const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text - -#endif//__SWR_OS_H__ +int SWR_API + ExecCmd(const std::string& cmd, ///< (In) Command line string + const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process + std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text + std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text + const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text + +#endif //__SWR_OS_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp index 79e82c4e6b3..e19a2d11045 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rdtsc_buckets.cpp -* -* @brief implementation of rdtsc buckets. -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rdtsc_buckets.cpp + * + * @brief implementation of rdtsc buckets. + * + * Notes: + * + ******************************************************************************/ #include "rdtsc_buckets.h" #include <inttypes.h> @@ -50,16 +50,16 @@ void BucketManager::RegisterThread(const std::string& name) BUCKET_THREAD newThread; newThread.name = name; newThread.root.children.reserve(mBuckets.size()); - newThread.root.id = 0; + newThread.root.id = 0; newThread.root.pParent = nullptr; - newThread.pCurrent = &newThread.root; + newThread.pCurrent = &newThread.root; mThreadMutex.lock(); // assign unique thread id for this thread - size_t id = mThreads.size(); + size_t id = mThreads.size(); newThread.id = (UINT)id; - tlsThreadId = (UINT)id; + tlsThreadId = (UINT)id; // store new thread mThreads.push_back(newThread); @@ -76,9 +76,10 @@ UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) return (UINT)id; } -void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) +void BucketManager::PrintBucket( + FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) { - const char *arrows[] = { + const char* arrows[] = { "", "|-> ", " |-> ", @@ -99,7 +100,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint // compute average cycle count per invocation uint64_t CPE = bucket.elapsed / bucket.count; - BUCKET_DESC &desc = mBuckets[bucket.id]; + BUCKET_DESC& desc = mBuckets[bucket.id]; // construct hierarchy visualization char hier[80]; @@ -107,16 +108,16 @@ void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint strcat(hier, desc.name.c_str()); // print out - fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", - percentTotal, - percentParent, - bucket.elapsed, - CPE, - bucket.count, - (unsigned long)0, - (uint32_t)0, - hier - ); + fprintf(f, + "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", + percentTotal, + percentParent, + bucket.elapsed, + CPE, + bucket.count, + (unsigned long)0, + (uint32_t)0, + hier); // dump all children of this bucket for (const BUCKET& child : bucket.children) @@ -135,8 +136,8 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); // compute thread level total cycle counts across 
all buckets from root - const BUCKET& root = thread.root; - uint64_t totalCycles = 0; + const BUCKET& root = thread.root; + uint64_t totalCycles = 0; for (const BUCKET& child : root.children) { totalCycles += child.elapsed; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h index 48042ac2233..bbc9538b86d 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rdtsc_buckets.h -* -* @brief declaration for rdtsc buckets. -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rdtsc_buckets.h + * + * @brief declaration for rdtsc buckets. 
+ * + * Notes: + * + ******************************************************************************/ #pragma once #include "os.h" @@ -48,7 +48,7 @@ extern THREAD UINT tlsThreadId; class BucketManager { public: - BucketManager() { } + BucketManager() {} ~BucketManager(); // removes all registered thread data @@ -112,7 +112,8 @@ public: // @param id generated by RegisterBucket INLINE void StartBucket(UINT id) { - if (!mCapturing) return; + if (!mCapturing) + return; SWR_ASSERT(tlsThreadId < mThreads.size()); @@ -125,10 +126,10 @@ public: { bt.pCurrent->children.resize(mBuckets.size()); } - BUCKET &child = bt.pCurrent->children[id]; + BUCKET& child = bt.pCurrent->children[id]; child.pParent = bt.pCurrent; - child.id = id; - child.start = tsc; + child.id = id; + child.start = tsc; // update thread's currently executing bucket bt.pCurrent = &child; @@ -142,7 +143,7 @@ public: INLINE void StopBucket(UINT id) { SWR_ASSERT(tlsThreadId < mThreads.size()); - BUCKET_THREAD &bt = mThreads[tlsThreadId]; + BUCKET_THREAD& bt = mThreads[tlsThreadId]; if (bt.level == 0) { @@ -152,7 +153,8 @@ public: uint64_t tsc = __rdtsc(); { - if (bt.pCurrent->start == 0) return; + if (bt.pCurrent->start == 0) + return; SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); bt.pCurrent->elapsed += (tsc - bt.pCurrent->start); @@ -167,7 +169,8 @@ public: INLINE void AddEvent(uint32_t id, uint32_t count) { - if (!mCapturing) return; + if (!mCapturing) + return; SWR_ASSERT(tlsThreadId < mThreads.size()); @@ -179,15 +182,16 @@ public: { bt.pCurrent->children.resize(mBuckets.size()); } - BUCKET &child = bt.pCurrent->children[id]; + BUCKET& child = bt.pCurrent->children[id]; child.pParent = bt.pCurrent; - child.id = id; + child.id = id; child.count += count; } } private: - void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); + void PrintBucket( + FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); void PrintThread(FILE* f, const BUCKET_THREAD& thread); // list of active threads that have registered with this manager @@ -197,10 +201,10 @@ private: std::vector<BUCKET_DESC> mBuckets; // is capturing currently enabled - volatile bool mCapturing{ false }; + volatile bool mCapturing{false}; // has capturing completed - volatile bool mDoneCapturing{ false }; + volatile bool mDoneCapturing{false}; std::mutex mThreadMutex; @@ -208,7 +212,6 @@ private: }; - // C helpers for jitter void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id); void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id); diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h index f6e75cda89d..fd3b1df746a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rdtsc_buckets.h -* -* @brief declaration for rdtsc buckets. -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rdtsc_buckets.h + * + * @brief declaration for rdtsc buckets. 
+ * + * Notes: + * + ******************************************************************************/ #pragma once #include <vector> @@ -34,12 +34,12 @@ struct BUCKET { - uint32_t id{ 0 }; - uint64_t start{ 0 }; - uint64_t elapsed{ 0 }; - uint32_t count{ 0 }; + uint32_t id{0}; + uint64_t start{0}; + uint64_t elapsed{0}; + uint32_t count{0}; - BUCKET* pParent{ nullptr }; + BUCKET* pParent{nullptr}; std::vector<BUCKET> children; }; @@ -65,29 +65,29 @@ struct BUCKET_THREAD std::string name; // id for this thread, assigned by the thread manager - uint32_t id{ 0 }; + uint32_t id{0}; // root of the bucket hierarchy for this thread BUCKET root; // currently executing bucket somewhere in the hierarchy - BUCKET* pCurrent{ nullptr }; + BUCKET* pCurrent{nullptr}; // currently executing hierarchy level - uint32_t level{ 0 }; + uint32_t level{0}; // threadviz file object - FILE* vizFile{ nullptr }; + FILE* vizFile{nullptr}; BUCKET_THREAD() {} BUCKET_THREAD(const BUCKET_THREAD& that) { - name = that.name; - id = that.id; - root = that.root; + name = that.name; + id = that.id; + root = that.root; pCurrent = &root; - vizFile = that.vizFile; + vizFile = that.vizFile; } }; @@ -100,14 +100,14 @@ enum VIZ_TYPE struct VIZ_START_DATA { - uint8_t type; + uint8_t type; uint32_t bucketId; uint64_t timestamp; }; struct VIZ_STOP_DATA { - uint8_t type; + uint8_t type; uint64_t timestamp; }; @@ -144,7 +144,7 @@ inline void Serialize(FILE* f, const std::string& string) inline void Deserialize(FILE* f, std::string& string) { - char cstr[256]; + char cstr[256]; uint8_t length; fread(&length, sizeof(length), 1, f); fread(cstr, length, 1, f); diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 98a8b9b2f9f..b08fb2eaaea 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #ifndef __SWR_SIMD16INTRIN_H__ #define __SWR_SIMD16INTRIN_H__ @@ -27,144 +27,146 @@ #if ENABLE_AVX512_SIMD16 #if KNOB_SIMD16_WIDTH == 16 -typedef SIMD512 SIMD16; +typedef SIMD512 SIMD16; #else #error Unsupported vector width -#endif//KNOB_SIMD16_WIDTH == 16 +#endif // KNOB_SIMD16_WIDTH == 16 -#define _simd16_setzero_ps SIMD16::setzero_ps -#define _simd16_setzero_si SIMD16::setzero_si -#define _simd16_set1_ps SIMD16::set1_ps -#define _simd16_set1_epi8 SIMD16::set1_epi8 -#define _simd16_set1_epi32 SIMD16::set1_epi32 -#define _simd16_set_ps SIMD16::set_ps -#define _simd16_set_epi32 SIMD16::set_epi32 -#define _simd16_load_ps SIMD16::load_ps -#define _simd16_loadu_ps SIMD16::loadu_ps -#if 1 -#define _simd16_load1_ps SIMD16::broadcast_ss -#endif -#define _simd16_load_si SIMD16::load_si -#define _simd16_loadu_si SIMD16::loadu_si -#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m) -#define _simd16_store_ps SIMD16::store_ps -#define _simd16_store_si SIMD16::store_si -#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a) -#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a) -#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b) -#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b) -#define _simd16_maskstore_ps SIMD16::maskstore_ps -#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b) -#define _simd16_blendv_ps SIMD16::blendv_ps -#define _simd16_blendv_epi32 SIMD16::blendv_epi32 -#define _simd16_mul_ps SIMD16::mul_ps -#define _simd16_div_ps SIMD16::div_ps -#define _simd16_add_ps SIMD16::add_ps -#define _simd16_sub_ps SIMD16::sub_ps -#define _simd16_rsqrt_ps SIMD16::rsqrt_ps -#define _simd16_min_ps SIMD16::min_ps -#define _simd16_max_ps SIMD16::max_ps -#define _simd16_movemask_ps SIMD16::movemask_ps -#define _simd16_movemask_pd SIMD16::movemask_pd -#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32 -#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32 -#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps -#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b) -#define _simd16_cmplt_ps SIMD16::cmplt_ps -#define _simd16_cmpgt_ps SIMD16::cmpgt_ps -#define _simd16_cmpneq_ps SIMD16::cmpneq_ps -#define _simd16_cmpeq_ps SIMD16::cmpeq_ps -#define _simd16_cmpge_ps SIMD16::cmpge_ps -#define _simd16_cmple_ps SIMD16::cmple_ps 
SIMD16::cmple_ps
-#define _simd16_castsi_ps SIMD16::castsi_ps -#define _simd16_castps_si SIMD16::castps_si -#define _simd16_castsi_pd SIMD16::castsi_pd -#define _simd16_castpd_si SIMD16::castpd_si -#define _simd16_castpd_ps SIMD16::castpd_ps -#define _simd16_castps_pd SIMD16::castps_pd -#define _simd16_and_ps SIMD16::and_ps -#define _simd16_andnot_ps SIMD16::andnot_ps -#define _simd16_or_ps SIMD16::or_ps -#define _simd16_xor_ps SIMD16::xor_ps -#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a) -#define _simd16_mul_epi32 SIMD16::mul_epi32 -#define _simd16_mullo_epi32 SIMD16::mullo_epi32 -#define _simd16_sub_epi32 SIMD16::sub_epi32 -#define _simd16_sub_epi64 SIMD16::sub_epi64 -#define _simd16_min_epi32 SIMD16::min_epi32 -#define _simd16_max_epi32 SIMD16::max_epi32 -#define _simd16_min_epu32 SIMD16::min_epu32 -#define _simd16_max_epu32 SIMD16::max_epu32 -#define _simd16_add_epi32 SIMD16::add_epi32 -#define _simd16_and_si SIMD16::and_si -#define _simd16_andnot_si SIMD16::andnot_si -#define _simd16_or_si SIMD16::or_si -#define _simd16_xor_si SIMD16::xor_si -#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32 -#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32 -#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32 -#define _simd16_testz_ps SIMD16::testz_ps -#define _simd16_unpacklo_ps SIMD16::unpacklo_ps -#define _simd16_unpackhi_ps SIMD16::unpackhi_ps -#define _simd16_unpacklo_pd SIMD16::unpacklo_pd -#define _simd16_unpackhi_pd SIMD16::unpackhi_pd -#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8 -#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8 -#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16 -#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16 -#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32 -#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32 -#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64 -#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64 -#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a) -#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a) -#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a) -#define _simd16_fmadd_ps SIMD16::fmadd_ps -#define _simd16_fmsub_ps SIMD16::fmsub_ps -#define _simd16_adds_epu8 SIMD16::adds_epu8 -#define _simd16_subs_epu8 SIMD16::subs_epu8 -#define _simd16_add_epi8 SIMD16::add_epi8 -#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8 +#define _simd16_setzero_ps SIMD16::setzero_ps +#define _simd16_setzero_si SIMD16::setzero_si +#define _simd16_set1_ps SIMD16::set1_ps +#define _simd16_set1_epi8 SIMD16::set1_epi8 +#define _simd16_set1_epi32 SIMD16::set1_epi32 +#define _simd16_set_ps SIMD16::set_ps +#define _simd16_set_epi32 SIMD16::set_epi32 +#define _simd16_load_ps SIMD16::load_ps +#define _simd16_loadu_ps SIMD16::loadu_ps +#if 1 +#define _simd16_load1_ps SIMD16::broadcast_ss +#endif +#define _simd16_load_si SIMD16::load_si +#define _simd16_loadu_si SIMD16::loadu_si +#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m) +#define _simd16_store_ps SIMD16::store_ps +#define _simd16_store_si SIMD16::store_si +#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a) +#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a) +#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b) +#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b) +#define _simd16_maskstore_ps SIMD16::maskstore_ps +#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b) +#define _simd16_blendv_ps SIMD16::blendv_ps +#define _simd16_blendv_epi32 SIMD16::blendv_epi32 +#define _simd16_mul_ps 
SIMD16::mul_ps +#define _simd16_div_ps SIMD16::div_ps +#define _simd16_add_ps SIMD16::add_ps +#define _simd16_sub_ps SIMD16::sub_ps +#define _simd16_rsqrt_ps SIMD16::rsqrt_ps +#define _simd16_min_ps SIMD16::min_ps +#define _simd16_max_ps SIMD16::max_ps +#define _simd16_movemask_ps SIMD16::movemask_ps +#define _simd16_movemask_pd SIMD16::movemask_pd +#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32 +#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32 +#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps +#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b) +#define _simd16_cmplt_ps SIMD16::cmplt_ps +#define _simd16_cmpgt_ps SIMD16::cmpgt_ps +#define _simd16_cmpneq_ps SIMD16::cmpneq_ps +#define _simd16_cmpeq_ps SIMD16::cmpeq_ps +#define _simd16_cmpge_ps SIMD16::cmpge_ps +#define _simd16_cmple_ps SIMD16::cmple_ps +#define _simd16_castsi_ps SIMD16::castsi_ps +#define _simd16_castps_si SIMD16::castps_si +#define _simd16_castsi_pd SIMD16::castsi_pd +#define _simd16_castpd_si SIMD16::castpd_si +#define _simd16_castpd_ps SIMD16::castpd_ps +#define _simd16_castps_pd SIMD16::castps_pd +#define _simd16_and_ps SIMD16::and_ps +#define _simd16_andnot_ps SIMD16::andnot_ps +#define _simd16_or_ps SIMD16::or_ps +#define _simd16_xor_ps SIMD16::xor_ps +#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a) +#define _simd16_mul_epi32 SIMD16::mul_epi32 +#define _simd16_mullo_epi32 SIMD16::mullo_epi32 +#define _simd16_sub_epi32 SIMD16::sub_epi32 +#define _simd16_sub_epi64 SIMD16::sub_epi64 +#define _simd16_min_epi32 SIMD16::min_epi32 +#define _simd16_max_epi32 SIMD16::max_epi32 +#define _simd16_min_epu32 SIMD16::min_epu32 +#define _simd16_max_epu32 SIMD16::max_epu32 +#define _simd16_add_epi32 SIMD16::add_epi32 +#define _simd16_and_si SIMD16::and_si +#define _simd16_andnot_si SIMD16::andnot_si +#define _simd16_or_si SIMD16::or_si +#define _simd16_xor_si SIMD16::xor_si +#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32 +#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32 +#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32 +#define _simd16_testz_ps SIMD16::testz_ps +#define _simd16_unpacklo_ps SIMD16::unpacklo_ps +#define _simd16_unpackhi_ps SIMD16::unpackhi_ps +#define _simd16_unpacklo_pd SIMD16::unpacklo_pd +#define _simd16_unpackhi_pd SIMD16::unpackhi_pd +#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8 +#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8 +#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16 +#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16 +#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32 +#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32 +#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64 +#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64 +#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a) +#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a) +#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a) +#define _simd16_fmadd_ps SIMD16::fmadd_ps +#define _simd16_fmsub_ps SIMD16::fmsub_ps +#define _simd16_adds_epu8 SIMD16::adds_epu8 +#define _simd16_subs_epu8 SIMD16::subs_epu8 +#define _simd16_add_epi8 SIMD16::add_epi8 +#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8 -#define _simd16_i32gather_ps(m, index, scale) SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index) -#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask) +#define _simd16_i32gather_ps(m, index, scale) \ + SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index) +#define 
_simd16_mask_i32gather_ps(a, m, index, mask, scale) \ + SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask) -#define _simd16_abs_epi32 SIMD16::abs_epi32 +#define _simd16_abs_epi32 SIMD16::abs_epi32 -#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64 -#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64 -#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16 -#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16 -#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8 -#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8 +#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64 +#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64 +#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16 +#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16 +#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8 +#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8 -#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a) -#define _simd16_permute_ps SIMD16::permute_ps -#define _simd16_permute_epi32 SIMD16::permute_epi32 -#define _simd16_sllv_epi32 SIMD16::sllv_epi32 -#define _simd16_srlv_epi32 SIMD16::sllv_epi32 -#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b) -#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b) -#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b) -#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b) -#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b) -#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b) -#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b) -#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16 -#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32 -#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32 -#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64 -#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64 -#define _simd16_packus_epi16 SIMD16::packus_epi16 -#define _simd16_packs_epi16 SIMD16::packs_epi16 -#define _simd16_packus_epi32 SIMD16::packus_epi32 -#define _simd16_packs_epi32 SIMD16::packs_epi32 -#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ> -#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ> -#define _simd16_int2mask(mask) simd16mask(mask) -#define _simd16_mask2int(mask) int(mask) -#define _simd16_vmask_ps SIMD16::vmask_ps +#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a) +#define _simd16_permute_ps SIMD16::permute_ps +#define _simd16_permute_epi32 SIMD16::permute_epi32 +#define _simd16_sllv_epi32 SIMD16::sllv_epi32 +#define _simd16_srlv_epi32 SIMD16::sllv_epi32 +#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b) +#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b) +#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b) +#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b) +#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b) +#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b) +#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b) +#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16 +#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32 +#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32 +#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64 +#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64 +#define _simd16_packus_epi16 SIMD16::packus_epi16 +#define _simd16_packs_epi16 SIMD16::packs_epi16 +#define _simd16_packus_epi32 SIMD16::packus_epi32 +#define _simd16_packs_epi32 
SIMD16::packs_epi32 +#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ> +#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ> +#define _simd16_int2mask(mask) simd16mask(mask) +#define _simd16_mask2int(mask) int(mask) +#define _simd16_vmask_ps SIMD16::vmask_ps -#endif//ENABLE_AVX512_SIMD16 +#endif // ENABLE_AVX512_SIMD16 -#endif//__SWR_SIMD16INTRIN_H_ +#endif //__SWR_SIMD16INTRIN_H_ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index b1471a97250..8ffda3f8458 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ****************************************************************************/ #ifndef __SWR_SIMDINTRIN_H__ #define __SWR_SIMDINTRIN_H__ @@ -28,176 +28,177 @@ #include "common/simdlib.hpp" #if KNOB_SIMD_WIDTH == 8 -typedef SIMD256 SIMD; +typedef SIMD256 SIMD; #else #error Unsupported vector width -#endif//KNOB_SIMD16_WIDTH == 16 - - -#define _simd128_maskstore_ps SIMD128::maskstore_ps -#define _simd128_fmadd_ps SIMD128::fmadd_ps - -#define _simd_load_ps SIMD::load_ps -#define _simd_load1_ps SIMD::broadcast_ss -#define _simd_loadu_ps SIMD::loadu_ps -#define _simd_setzero_ps SIMD::setzero_ps -#define _simd_set1_ps SIMD::set1_ps -#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b) -#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b) -#define _simd_blendv_ps SIMD::blendv_ps -#define _simd_store_ps SIMD::store_ps -#define _simd_mul_ps SIMD::mul_ps -#define _simd_add_ps SIMD::add_ps -#define _simd_sub_ps SIMD::sub_ps -#define _simd_rsqrt_ps SIMD::rsqrt_ps -#define _simd_min_ps SIMD::min_ps -#define _simd_max_ps SIMD::max_ps -#define _simd_movemask_ps SIMD::movemask_ps -#define _simd_cvtps_epi32 SIMD::cvtps_epi32 -#define _simd_cvttps_epi32 SIMD::cvttps_epi32 -#define _simd_cvtepi32_ps SIMD::cvtepi32_ps -#define _simd_cmplt_ps SIMD::cmplt_ps -#define _simd_cmpgt_ps SIMD::cmpgt_ps -#define _simd_cmpneq_ps SIMD::cmpneq_ps -#define _simd_cmpeq_ps SIMD::cmpeq_ps -#define _simd_cmpge_ps SIMD::cmpge_ps -#define _simd_cmple_ps SIMD::cmple_ps -#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b) -#define _simd_and_ps SIMD::and_ps -#define _simd_or_ps SIMD::or_ps -#define _simd_rcp_ps SIMD::rcp_ps -#define _simd_div_ps SIMD::div_ps -#define _simd_castsi_ps SIMD::castsi_ps -#define _simd_castps_pd SIMD::castps_pd -#define _simd_castpd_ps SIMD::castpd_ps -#define _simd_andnot_ps SIMD::andnot_ps -#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a) -#define _simd_castpd_ps SIMD::castpd_ps -#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const *)(a)) -#define _simd_stream_ps SIMD::stream_ps - -#define _simd_movemask_pd SIMD::movemask_pd -#define _simd_castsi_pd SIMD::castsi_pd - -#define _simd_mul_epi32 SIMD::mul_epi32 -#define _simd_mullo_epi32 SIMD::mullo_epi32 -#define _simd_sub_epi32 SIMD::sub_epi32 -#define _simd_sub_epi64 SIMD::sub_epi64 -#define _simd_min_epi32 SIMD::min_epi32 -#define _simd_min_epu32 SIMD::min_epu32 -#define _simd_max_epi32 SIMD::max_epi32 -#define _simd_max_epu32 SIMD::max_epu32 -#define _simd_add_epi32 SIMD::add_epi32 -#define _simd_and_si SIMD::and_si -#define _simd_andnot_si SIMD::andnot_si -#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32 -#define _simd_cmplt_epi32 SIMD::cmplt_epi32 -#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32 -#define _simd_or_si SIMD::or_si -#define _simd_xor_si SIMD::xor_si -#define _simd_castps_si SIMD::castps_si -#define _simd_adds_epu8 SIMD::adds_epu8 -#define _simd_subs_epu8 SIMD::subs_epu8 -#define _simd_add_epi8 SIMD::add_epi8 -#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64 -#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64 -#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8 -#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8 -#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16 -#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16 -#define _simd_movemask_epi8 SIMD::movemask_epi8 -#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a) -#define _simd_permute_ps SIMD::permute_ps -#define _simd_permute_epi32 SIMD::permute_epi32 -#define _simd_srlv_epi32 SIMD::srlv_epi32 -#define _simd_sllv_epi32 SIMD::sllv_epi32 - -#define 
_simd_unpacklo_epi8 SIMD::unpacklo_epi8 -#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8 -#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16 -#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16 -#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32 -#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32 -#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64 -#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64 - -#define _simd_slli_epi32(a,i) SIMD::slli_epi32<i>(a) -#define _simd_srai_epi32(a,i) SIMD::srai_epi32<i>(a) -#define _simd_srli_epi32(a,i) SIMD::srli_epi32<i>(a) -#define _simd_srlisi_ps(a,i) SIMD::srlisi_ps<i>(a) - -#define _simd_fmadd_ps SIMD::fmadd_ps -#define _simd_fmsub_ps SIMD::fmsub_ps -#define _simd_shuffle_epi8 SIMD::shuffle_epi8 - -#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o) -#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m) -#define _simd_abs_epi32 SIMD::abs_epi32 - -#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16 -#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32 -#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32 -#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64 -#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64 - -#define _simd_packus_epi16 SIMD::packus_epi16 -#define _simd_packs_epi16 SIMD::packs_epi16 -#define _simd_packus_epi32 SIMD::packus_epi32 -#define _simd_packs_epi32 SIMD::packs_epi32 - -#define _simd_unpacklo_ps SIMD::unpacklo_ps -#define _simd_unpackhi_ps SIMD::unpackhi_ps -#define _simd_unpacklo_pd SIMD::unpacklo_pd -#define _simd_unpackhi_pd SIMD::unpackhi_pd -#define _simd_insertf128_ps SIMD::insertf128_ps -#define _simd_insertf128_pd SIMD::insertf128_pd -#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b) -#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a) -#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a) -#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a) -#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b) -#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b) -#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b) -#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b) -#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b) -#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b) -#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b) -#define _simd_set1_epi32 SIMD::set1_epi32 -#define _simd_set_epi32 SIMD::set_epi32 -#define _simd_set_ps SIMD::set_ps -#define _simd_set1_epi8 SIMD::set1_epi8 -#define _simd_setzero_si SIMD::setzero_si -#define _simd_cvttps_epi32 SIMD::cvttps_epi32 -#define _simd_store_si SIMD::store_si -#define _simd_broadcast_ss SIMD::broadcast_ss -#define _simd_maskstore_ps SIMD::maskstore_ps -#define _simd_load_si SIMD::load_si -#define _simd_loadu_si SIMD::loadu_si -#define _simd_sub_ps SIMD::sub_ps -#define _simd_testz_ps SIMD::testz_ps -#define _simd_testz_si SIMD::testz_si -#define _simd_xor_ps SIMD::xor_ps - -#define _simd_loadu2_si SIMD::loadu2_si -#define _simd_storeu2_si SIMD::storeu2_si - -#define _simd_blendv_epi32 SIMD::blendv_epi32 -#define _simd_vmask_ps SIMD::vmask_ps - -template<int mask> SIMDINLINE -SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const &a, SIMD128::Integer const &b) +#endif // KNOB_SIMD16_WIDTH == 16 + +#define _simd128_maskstore_ps SIMD128::maskstore_ps +#define _simd128_fmadd_ps SIMD128::fmadd_ps + +#define _simd_load_ps SIMD::load_ps +#define _simd_load1_ps SIMD::broadcast_ss 
+#define _simd_loadu_ps SIMD::loadu_ps +#define _simd_setzero_ps SIMD::setzero_ps +#define _simd_set1_ps SIMD::set1_ps +#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b) +#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b) +#define _simd_blendv_ps SIMD::blendv_ps +#define _simd_store_ps SIMD::store_ps +#define _simd_mul_ps SIMD::mul_ps +#define _simd_add_ps SIMD::add_ps +#define _simd_sub_ps SIMD::sub_ps +#define _simd_rsqrt_ps SIMD::rsqrt_ps +#define _simd_min_ps SIMD::min_ps +#define _simd_max_ps SIMD::max_ps +#define _simd_movemask_ps SIMD::movemask_ps +#define _simd_cvtps_epi32 SIMD::cvtps_epi32 +#define _simd_cvttps_epi32 SIMD::cvttps_epi32 +#define _simd_cvtepi32_ps SIMD::cvtepi32_ps +#define _simd_cmplt_ps SIMD::cmplt_ps +#define _simd_cmpgt_ps SIMD::cmpgt_ps +#define _simd_cmpneq_ps SIMD::cmpneq_ps +#define _simd_cmpeq_ps SIMD::cmpeq_ps +#define _simd_cmpge_ps SIMD::cmpge_ps +#define _simd_cmple_ps SIMD::cmple_ps +#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b) +#define _simd_and_ps SIMD::and_ps +#define _simd_or_ps SIMD::or_ps +#define _simd_rcp_ps SIMD::rcp_ps +#define _simd_div_ps SIMD::div_ps +#define _simd_castsi_ps SIMD::castsi_ps +#define _simd_castps_pd SIMD::castps_pd +#define _simd_castpd_ps SIMD::castpd_ps +#define _simd_andnot_ps SIMD::andnot_ps +#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a) +#define _simd_castpd_ps SIMD::castpd_ps +#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a)) +#define _simd_stream_ps SIMD::stream_ps + +#define _simd_movemask_pd SIMD::movemask_pd +#define _simd_castsi_pd SIMD::castsi_pd + +#define _simd_mul_epi32 SIMD::mul_epi32 +#define _simd_mullo_epi32 SIMD::mullo_epi32 +#define _simd_sub_epi32 SIMD::sub_epi32 +#define _simd_sub_epi64 SIMD::sub_epi64 +#define _simd_min_epi32 SIMD::min_epi32 +#define _simd_min_epu32 SIMD::min_epu32 +#define _simd_max_epi32 SIMD::max_epi32 +#define _simd_max_epu32 SIMD::max_epu32 +#define _simd_add_epi32 SIMD::add_epi32 +#define _simd_and_si SIMD::and_si +#define _simd_andnot_si SIMD::andnot_si +#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32 +#define _simd_cmplt_epi32 SIMD::cmplt_epi32 +#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32 +#define _simd_or_si SIMD::or_si +#define _simd_xor_si SIMD::xor_si +#define _simd_castps_si SIMD::castps_si +#define _simd_adds_epu8 SIMD::adds_epu8 +#define _simd_subs_epu8 SIMD::subs_epu8 +#define _simd_add_epi8 SIMD::add_epi8 +#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64 +#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64 +#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8 +#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8 +#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16 +#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16 +#define _simd_movemask_epi8 SIMD::movemask_epi8 +#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a) +#define _simd_permute_ps SIMD::permute_ps +#define _simd_permute_epi32 SIMD::permute_epi32 +#define _simd_srlv_epi32 SIMD::srlv_epi32 +#define _simd_sllv_epi32 SIMD::sllv_epi32 + +#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8 +#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8 +#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16 +#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16 +#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32 +#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32 +#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64 +#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64 + +#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a) +#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a) +#define 
_simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a) +#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a) + +#define _simd_fmadd_ps SIMD::fmadd_ps +#define _simd_fmsub_ps SIMD::fmsub_ps +#define _simd_shuffle_epi8 SIMD::shuffle_epi8 + +#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o) +#define _simd_mask_i32gather_ps(r, p, o, m, s) \ + SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m) +#define _simd_abs_epi32 SIMD::abs_epi32 + +#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16 +#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32 +#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32 +#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64 +#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64 + +#define _simd_packus_epi16 SIMD::packus_epi16 +#define _simd_packs_epi16 SIMD::packs_epi16 +#define _simd_packus_epi32 SIMD::packus_epi32 +#define _simd_packs_epi32 SIMD::packs_epi32 + +#define _simd_unpacklo_ps SIMD::unpacklo_ps +#define _simd_unpackhi_ps SIMD::unpackhi_ps +#define _simd_unpacklo_pd SIMD::unpacklo_pd +#define _simd_unpackhi_pd SIMD::unpackhi_pd +#define _simd_insertf128_ps SIMD::insertf128_ps +#define _simd_insertf128_pd SIMD::insertf128_pd +#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b) +#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a) +#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a) +#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a) +#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b) +#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b) +#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b) +#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b) +#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b) +#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b) +#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b) +#define _simd_set1_epi32 SIMD::set1_epi32 +#define _simd_set_epi32 SIMD::set_epi32 +#define _simd_set_ps SIMD::set_ps +#define _simd_set1_epi8 SIMD::set1_epi8 +#define _simd_setzero_si SIMD::setzero_si +#define _simd_cvttps_epi32 SIMD::cvttps_epi32 +#define _simd_store_si SIMD::store_si +#define _simd_broadcast_ss SIMD::broadcast_ss +#define _simd_maskstore_ps SIMD::maskstore_ps +#define _simd_load_si SIMD::load_si +#define _simd_loadu_si SIMD::loadu_si +#define _simd_sub_ps SIMD::sub_ps +#define _simd_testz_ps SIMD::testz_ps +#define _simd_testz_si SIMD::testz_si +#define _simd_xor_ps SIMD::xor_ps + +#define _simd_loadu2_si SIMD::loadu2_si +#define _simd_storeu2_si SIMD::storeu2_si + +#define _simd_blendv_epi32 SIMD::blendv_epi32 +#define _simd_vmask_ps SIMD::vmask_ps + +template <int mask> +SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b) { - return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); + return SIMD128::castps_si( + SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); } SIMDINLINE -void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane) +void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane) { OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH]; SIMD256::store_ps(rArray, r); SIMD256::store_ps(sArray, s); rArray[rlane] = sArray[slane]; - r = SIMD256::load_ps(rArray); + r = SIMD256::load_ps(rArray); } // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww. 
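For readers skimming the reformatted interpolation helpers in the next hunk: a minimal standalone scalar sketch (illustrative names, not part of this commit or of the SWR sources) of what vplaneps and InterpolateComponent compute per SIMD lane, using barycentric weights i, j and k = 1 - i - j:

#include <cstdio>

// Per-lane math behind the SIMD helpers below:
// vplaneps(vA, vB, vC, vX, vY) evaluates the plane equation a*x + b*y + c
// with two fused multiply-adds; InterpolateComponent pre-scales the third
// corner value by k = 1 - i - j, so the same plane evaluation yields the
// barycentric blend A*i + B*j + C*k.

static float PlaneEq(float a, float b, float c, float x, float y)
{
    return a * x + b * y + c;
}

static float InterpolateScalar(float A, float B, float C, float i, float j)
{
    float k = 1.0f - i - j;            // third barycentric weight
    return PlaneEq(A, B, C * k, i, j); // A*i + B*j + C*k
}

int main()
{
    // Attribute values 0, 1, 2 at the three triangle corners, sampled at
    // the centroid (i = j = 1/3): the blend gives (0 + 1 + 2) / 3 = 1.
    printf("%f\n", InterpolateScalar(0.0f, 1.0f, 2.0f, 1.0f / 3.0f, 1.0f / 3.0f));
    return 0;
}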
@@ -228,34 +229,42 @@ void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int #endif -#define _simdvec_dp3_ps SIMD::vec4_dp3_ps -#define _simdvec_dp4_ps SIMD::vec4_dp4_ps -#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps -#define _simdvec_normalize_ps SIMD::vec4_normalize_ps -#define _simdvec_mul_ps SIMD::vec4_mul_ps -#define _simdvec_add_ps SIMD::vec4_add_ps -#define _simdvec_min_ps SIMD::vec4_min_ps -#define _simdvec_max_ps SIMD::vec4_max_ps -#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply -#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply -#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply -#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply +#define _simdvec_dp3_ps SIMD::vec4_dp3_ps +#define _simdvec_dp4_ps SIMD::vec4_dp4_ps +#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps +#define _simdvec_normalize_ps SIMD::vec4_normalize_ps +#define _simdvec_mul_ps SIMD::vec4_mul_ps +#define _simdvec_add_ps SIMD::vec4_add_ps +#define _simdvec_min_ps SIMD::vec4_min_ps +#define _simdvec_max_ps SIMD::vec4_max_ps +#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply +#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply +#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply +#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply ////////////////////////////////////////////////////////////////////////// /// @brief Compute plane equation vA * vX + vB * vY + vC -SIMDINLINE simdscalar vplaneps(simdscalar const &vA, simdscalar const &vB, simdscalar const &vC, simdscalar const &vX, simdscalar const &vY) +SIMDINLINE simdscalar vplaneps(simdscalar const& vA, + simdscalar const& vB, + simdscalar const& vC, + simdscalar const& vX, + simdscalar const& vY) { simdscalar vOut = _simd_fmadd_ps(vA, vX, vC); - vOut = _simd_fmadd_ps(vB, vY, vOut); + vOut = _simd_fmadd_ps(vB, vY, vOut); return vOut; } ////////////////////////////////////////////////////////////////////////// /// @brief Compute plane equation vA * vX + vB * vY + vC -SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, simd4scalar const &vC, simd4scalar const &vX, simd4scalar const &vY) +SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA, + simd4scalar const& vB, + simd4scalar const& vC, + simd4scalar const& vX, + simd4scalar const& vY) { simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC); - vOut = _simd128_fmadd_ps(vB, vY, vOut); + vOut = _simd128_fmadd_ps(vB, vY, vOut); return vOut; } @@ -264,30 +273,32 @@ SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, si /// @param vI - barycentric I /// @param vJ - barycentric J /// @param pInterpBuffer - pointer to attribute barycentric coeffs -template<UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simdscalar InterpolateComponent(simdscalar const &vI, simdscalar const &vJ, const float *pInterpBuffer) +template <UINT Attrib, UINT Comp, UINT numComponents = 4> +static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI, + simdscalar const& vJ, + const float* pInterpBuffer) { - const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; - const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; + const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; + const float* pInterpB = &pInterpBuffer[Attrib * 3 * 
numComponents + numComponents + Comp]; + const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; simdscalar vA = _simd_broadcast_ss(pInterpA); simdscalar vB = _simd_broadcast_ss(pInterpB); simdscalar vC = _simd_broadcast_ss(pInterpC); simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ); - vC = _simd_mul_ps(vk, vC); - + vC = _simd_mul_ps(vk, vC); + return vplaneps(vA, vB, vC, vI, vJ); } ////////////////////////////////////////////////////////////////////////// /// @brief Interpolates a single component (flat shade). /// @param pInterpBuffer - pointer to attribute barycentric coeffs -template<UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer) +template <UINT Attrib, UINT Comp, UINT numComponents = 4> +static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer) { - const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; + const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; simdscalar vA = _simd_broadcast_ss(pInterpA); @@ -299,38 +310,39 @@ static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer /// @param vI - barycentric I /// @param vJ - barycentric J /// @param pInterpBuffer - pointer to attribute barycentric coeffs -template<UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const &vI, simd4scalar const &vJ, const float *pInterpBuffer) +template <UINT Attrib, UINT Comp, UINT numComponents = 4> +static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI, + simd4scalar const& vJ, + const float* pInterpBuffer) { - const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; - const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; + const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; + const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; + const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; simd4scalar vA = SIMD128::broadcast_ss(pInterpA); simd4scalar vB = SIMD128::broadcast_ss(pInterpB); simd4scalar vC = SIMD128::broadcast_ss(pInterpC); simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ); - vC = SIMD128::mul_ps(vk, vC); + vC = SIMD128::mul_ps(vk, vC); return vplaneps(vA, vB, vC, vI, vJ); } -static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const &a) +static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a) { simd4scalari ai = SIMD128::castps_si(a); return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff))); } -static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const &a) +static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a) { simdscalari ai = _simd_castps_si(a); return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); } - #if ENABLE_AVX512_SIMD16 #include "simd16intrin.h" -#endif//ENABLE_AVX512_SIMD16 +#endif // ENABLE_AVX512_SIMD16 -#endif//__SWR_SIMDINTRIN_H__ +#endif //__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp index 24cf27d4dbc..bd48fb2aae7 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ 
b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ****************************************************************************/ #pragma once #include "simdlib_types.hpp" @@ -38,8 +38,7 @@ namespace SIMDImpl #include "simdlib_128_avx.inl" #undef __SIMD_LIB_AVX_HPP__ }; // struct AVXImpl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX - +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX #if SIMD_ARCH >= SIMD_ARCH_AVX2 struct AVX2Impl : AVXImpl @@ -48,7 +47,7 @@ namespace SIMDImpl #include "simdlib_128_avx2.inl" #undef __SIMD_LIB_AVX2_HPP__ }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct AVX512Impl : AVX2Impl @@ -62,9 +61,9 @@ namespace SIMDImpl #include "simdlib_128_avx512_core.inl" #endif // defined(SIMD_ARCH_KNIGHTS) #undef __SIMD_LIB_AVX512_HPP__ -#endif // SIMD_OPT_128_AVX512 +#endif // SIMD_OPT_128_AVX512 }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct Traits : SIMDImpl::Traits { @@ -78,13 +77,13 @@ namespace SIMDImpl #error Invalid value for SIMD_ARCH #endif - using Float = SIMD128Impl::Float; - using Double = SIMD128Impl::Double; - using Integer = SIMD128Impl::Integer; - using Vec4 = SIMD128Impl::Vec4; - using Mask = SIMD128Impl::Mask; + using Float = SIMD128Impl::Float; + using Double = SIMD128Impl::Double; + using Integer = SIMD128Impl::Integer; + using Vec4 = SIMD128Impl::Vec4; + using Mask = SIMD128Impl::Mask; }; - } // ns SIMD128Impl + } // namespace SIMD128Impl namespace SIMD256Impl { @@ -95,8 +94,7 @@ namespace SIMDImpl #include "simdlib_256_avx.inl" #undef __SIMD_LIB_AVX_HPP__ }; // struct AVXImpl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX - +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX #if SIMD_ARCH >= SIMD_ARCH_AVX2 struct AVX2Impl : AVXImpl @@ -105,7 +103,7 @@ namespace SIMDImpl #include "simdlib_256_avx2.inl" #undef __SIMD_LIB_AVX2_HPP__ }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct AVX512Impl : AVX2Impl @@ -119,9 +117,9 @@ namespace SIMDImpl #include "simdlib_256_avx512_core.inl" #endif // defined(SIMD_ARCH_KNIGHTS) #undef __SIMD_LIB_AVX512_HPP__ -#endif // SIMD_OPT_256_AVX512 +#endif // SIMD_OPT_256_AVX512 }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct Traits : SIMDImpl::Traits { @@ -135,18 +133,18 @@ namespace SIMDImpl #error Invalid value for SIMD_ARCH #endif - using Float = SIMD256Impl::Float; - using Double = SIMD256Impl::Double; - using Integer = SIMD256Impl::Integer; - using Vec4 = SIMD256Impl::Vec4; - using Mask = SIMD256Impl::Mask; + using Float = SIMD256Impl::Float; + using Double = SIMD256Impl::Double; + using Integer = SIMD256Impl::Integer; + using Vec4 = SIMD256Impl::Vec4; + using Mask = SIMD256Impl::Mask; }; - } // ns SIMD256Impl + } // namespace SIMD256Impl namespace SIMD512Impl { #if SIMD_ARCH >= SIMD_ARCH_AVX - template<typename SIMD256T> + template <typename SIMD256T> struct AVXImplBase { #define __SIMD_LIB_AVX_HPP__ @@ -157,12 +155,10 @@ namespace SIMDImpl using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>; #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX - #if SIMD_ARCH >= SIMD_ARCH_AVX2 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>; #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 - #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl> { @@ -178,7 +174,7 @@ namespace SIMDImpl #endif // defined(SIMD_ARCH_KNIGHTS) #undef __SIMD_LIB_AVX512_HPP__ }; // 
struct AVX512ImplBase -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 +#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 struct Traits : SIMDImpl::Traits { @@ -192,33 +188,32 @@ namespace SIMDImpl #error Invalid value for SIMD_ARCH #endif - using Float = SIMD512Impl::Float; - using Double = SIMD512Impl::Double; - using Integer = SIMD512Impl::Integer; - using Vec4 = SIMD512Impl::Vec4; - using Mask = SIMD512Impl::Mask; + using Float = SIMD512Impl::Float; + using Double = SIMD512Impl::Double; + using Integer = SIMD512Impl::Integer; + using Vec4 = SIMD512Impl::Vec4; + using Mask = SIMD512Impl::Mask; }; - } // ns SIMD512Impl -} // ns SIMDImpl + } // namespace SIMD512Impl +} // namespace SIMDImpl template <typename Traits> struct SIMDBase : Traits::IsaImpl { - using CompareType = typename Traits::CompareType; - using ScaleFactor = typename Traits::ScaleFactor; - using RoundMode = typename Traits::RoundMode; - using SIMD = typename Traits::IsaImpl; - using Float = typename Traits::Float; - using Double = typename Traits::Double; - using Integer = typename Traits::Integer; - using Vec4 = typename Traits::Vec4; - using Mask = typename Traits::Mask; + using CompareType = typename Traits::CompareType; + using ScaleFactor = typename Traits::ScaleFactor; + using RoundMode = typename Traits::RoundMode; + using SIMD = typename Traits::IsaImpl; + using Float = typename Traits::Float; + using Double = typename Traits::Double; + using Integer = typename Traits::Integer; + using Vec4 = typename Traits::Vec4; + using Mask = typename Traits::Mask; static const size_t VECTOR_BYTES = sizeof(Float); // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww. - static SIMDINLINE - void vec4_load1_ps(Vec4& r, const float *p) + static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p) { r[0] = SIMD::set1_ps(p[0]); r[1] = SIMD::set1_ps(p[1]); @@ -226,8 +221,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::set1_ps(p[3]); } - static SIMDINLINE - void vec4_set1_vps(Vec4& r, Float const &s) + static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s) { r[0] = s; r[1] = s; @@ -235,48 +229,44 @@ struct SIMDBase : Traits::IsaImpl r[3] = s; } - static SIMDINLINE - Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1) + static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1) { Float tmp, r; - r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) return r; } - static SIMDINLINE - Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1) + static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1) { Float tmp, r; - r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = 
SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) return r; } - static SIMDINLINE - Float vec4_rcp_length_ps(const Vec4& v) + static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v) { Float length = vec4_dp4_ps(v, v); return SIMD::rsqrt_ps(length); } - static SIMDINLINE - void vec4_normalize_ps(Vec4& r, const Vec4& v) + static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v) { Float rcpLength = vec4_rcp_length_ps(v); @@ -286,8 +276,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::mul_ps(v[3], rcpLength); } - static SIMDINLINE - void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s) + static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s) { r[0] = SIMD::mul_ps(v[0], s); r[1] = SIMD::mul_ps(v[1], s); @@ -295,8 +284,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::mul_ps(v[3], s); } - static SIMDINLINE - void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1) + static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1) { r[0] = SIMD::mul_ps(v0[0], v1[0]); r[1] = SIMD::mul_ps(v0[1], v1[1]); @@ -304,8 +292,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::mul_ps(v0[3], v1[3]); } - static SIMDINLINE - void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s) + static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s) { r[0] = SIMD::add_ps(v0[0], s); r[1] = SIMD::add_ps(v0[1], s); @@ -313,8 +300,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::add_ps(v0[3], s); } - static SIMDINLINE - void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1) + static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1) { r[0] = SIMD::add_ps(v0[0], v1[0]); r[1] = SIMD::add_ps(v0[1], v1[1]); @@ -322,8 +308,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::add_ps(v0[3], v1[3]); } - static SIMDINLINE - void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s) + static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s) { r[0] = SIMD::min_ps(v0[0], s); r[1] = SIMD::min_ps(v0[1], s); @@ -331,8 +316,7 @@ struct SIMDBase : Traits::IsaImpl r[3] = SIMD::min_ps(v0[3], s); } - static SIMDINLINE - void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s) + static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s) { r[0] = SIMD::max_ps(v0[0], s); r[1] = SIMD::max_ps(v0[1], s); @@ -345,66 +329,64 @@ struct SIMDBase : Traits::IsaImpl // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) - static SIMDINLINE - void SIMDCALL mat4x4_vec4_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v) + static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result, + const float* pMatrix, + const Vec4& v) { Float m; Float r0; Float r1; - m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0*4 + 3); // 
m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] + r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) result[0] = r0; - m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] + r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) result[1] = r0; - m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] + r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) result[2] = r0; - m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * 
v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] + r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) result[3] = r0; } @@ -413,44 +395,42 @@ struct SIMDBase : Traits::IsaImpl // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) - static SIMDINLINE - void SIMDCALL mat3x3_vec3_w0_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v) + static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result, + const float* pMatrix, + const Vec4& v) { Float m; Float r0; Float r1; - m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) result[0] = r0; - m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) result[1] = r0; - m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = 
SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) result[2] = r0; result[3] = SIMD::setzero_ps(); @@ -461,108 +441,104 @@ struct SIMDBase : Traits::IsaImpl // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) - static SIMDINLINE - void SIMDCALL mat4x4_vec3_w1_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v) + static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result, + const float* pMatrix, + const Vec4& v) { Float m; Float r0; Float r1; - m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[0] = r0; - m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[1] = r0; - m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2] - 
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[2] = r0; - m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3] - result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] + result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) } - static SIMDINLINE - void SIMDCALL mat4x3_vec3_w1_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v) + static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result, + const float* pMatrix, + const Vec4& v) { Float m; Float r0; Float r1; - m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[0] = r0; - m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = 
SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[1] = r0; - m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] + r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] + r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] + r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) + r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] + r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) result[2] = r0; result[3] = SIMD::set1_ps(1.0f); } @@ -572,30 +548,38 @@ using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>; using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>; using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>; -template <typename SIMD_T> using CompareType = typename SIMD_T::CompareType; -template <typename SIMD_T> using ScaleFactor = typename SIMD_T::ScaleFactor; -template <typename SIMD_T> using RoundMode = typename SIMD_T::RoundMode; -template <typename SIMD_T> using Float = typename SIMD_T::Float; -template <typename SIMD_T> using Double = typename SIMD_T::Double; -template <typename SIMD_T> using Integer = typename SIMD_T::Integer; -template <typename SIMD_T> using Vec4 = typename SIMD_T::Vec4; -template <typename SIMD_T> using Mask = typename SIMD_T::Mask; +template <typename SIMD_T> +using CompareType = typename SIMD_T::CompareType; +template <typename SIMD_T> +using ScaleFactor = typename SIMD_T::ScaleFactor; +template <typename SIMD_T> +using RoundMode = typename SIMD_T::RoundMode; +template <typename SIMD_T> +using Float = typename SIMD_T::Float; +template <typename SIMD_T> +using Double = typename SIMD_T::Double; +template <typename SIMD_T> +using Integer = typename SIMD_T::Integer; +template <typename SIMD_T> +using Vec4 = typename SIMD_T::Vec4; +template <typename SIMD_T> +using Mask = typename SIMD_T::Mask; template <typename SIMD_T> struct SIMDVecEqual { - INLINE bool operator () (Integer<SIMD_T> a, Integer<SIMD_T> b) const + INLINE bool 
operator()(Integer<SIMD_T> a, Integer<SIMD_T> b) const { Integer<SIMD_T> c = SIMD_T::xor_si(a, b); return SIMD_T::testz_si(c, c); } - INLINE bool operator () (Float<SIMD_T> a, Float<SIMD_T> b) const + INLINE bool operator()(Float<SIMD_T> a, Float<SIMD_T> b) const { return this->operator()(SIMD_T::castps_si(a), SIMD_T::castps_si(b)); } - INLINE bool operator () (Double<SIMD_T> a, Double<SIMD_T> b) const + INLINE bool operator()(Double<SIMD_T> a, Double<SIMD_T> b) const { return this->operator()(SIMD_T::castpd_si(a), SIMD_T::castpd_si(b)); } @@ -604,13 +588,13 @@ struct SIMDVecEqual template <typename SIMD_T> struct SIMDVecHash { - INLINE uint32_t operator ()(Integer<SIMD_T> val) const + INLINE uint32_t operator()(Integer<SIMD_T> val) const { #if defined(_WIN64) || !defined(_WIN32) // assume non-Windows is always 64-bit static_assert(sizeof(void*) == 8, "This path only meant for 64-bit code"); - uint64_t crc32 = 0; - const uint64_t *pData = reinterpret_cast<const uint64_t*>(&val); + uint64_t crc32 = 0; + const uint64_t* pData = reinterpret_cast<const uint64_t*>(&val); static const uint32_t loopIterations = sizeof(val) / sizeof(void*); static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size"); @@ -624,7 +608,7 @@ struct SIMDVecHash static_assert(sizeof(void*) == 4, "This path only meant for 32-bit code"); uint32_t crc32 = 0; - const uint32_t *pData = reinterpret_cast<const uint32_t*>(&val); + const uint32_t* pData = reinterpret_cast<const uint32_t*>(&val); static const uint32_t loopIterations = sizeof(val) / sizeof(void*); static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size"); @@ -637,11 +621,11 @@ struct SIMDVecHash #endif }; - INLINE uint32_t operator ()(Float<SIMD_T> val) const + INLINE uint32_t operator()(Float<SIMD_T> val) const { return operator()(SIMD_T::castps_si(val)); }; - INLINE uint32_t operator ()(Double<SIMD_T> val) const + INLINE uint32_t operator()(Double<SIMD_T> val) const { return operator()(SIMD_T::castpd_si(val)); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl index b1511c6c0e2..0c5795cf136 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -28,100 +28,79 @@ // SIMD128 AVX (1) implementation //============================================================================ -#define SIMD_WRAPPER_1(op) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return _mm_##op(a);\ - } +#define SIMD_WRAPPER_1(op) \ + static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); } -#define SIMD_WRAPPER_2(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm_##op(a, b);\ - } +#define SIMD_WRAPPER_2(op) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); } -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm_##op(a, b);\ - } +#define SIMD_DWRAPPER_2(op) \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); } -#define SIMD_WRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm_##op(a, b, ImmT);\ +#define SIMD_WRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm_##op(a, b, ImmT); \ } -#define SIMD_DWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm_##op(a, b, ImmT);\ +#define SIMD_DWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return _mm_##op(a, b, ImmT); \ } -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - {\ - return _mm_##op(a, b, c);\ - } +#define SIMD_WRAPPER_3(op) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); } -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return _mm_##op(a);\ - } +#define 
SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); } -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return intrin(a, ImmT);\ +#define SIMD_IWRAPPER_1I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return intrin(a, ImmT); \ } #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op) -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return intrin(a, b);\ - } +#define SIMD_IWRAPPER_2_(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); } -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm_##op(a, b);\ - } +#define SIMD_IWRAPPER_2(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); } -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\ +#define SIMD_IFWRAPPER_2(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \ } -#define SIMD_IWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm_##op(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return _mm_##op(a, b, ImmT); \ } //----------------------------------------------------------------------- // Single precision floating point arithmetic operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a -SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b +SIMD_WRAPPER_2(add_ps); // return a + b +SIMD_WRAPPER_2(div_ps); // return a / b +SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b +SIMD_WRAPPER_2(min_ps); // return (a < b) ? 
a : b +SIMD_WRAPPER_2(mul_ps); // return a * b +SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a +SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) +SIMD_WRAPPER_2(sub_ps); // return a - b -static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c +static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c { return add_ps(mul_ps(a, b), c); } -static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c +static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c { return sub_ps(mul_ps(a, b), c); } @@ -132,8 +111,14 @@ static SIMDINLINE Float SIMDCALL round_ps(Float a) return _mm_round_ps(a, static_cast<int>(RMT)); } -static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } -static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } +static SIMDINLINE Float SIMDCALL ceil_ps(Float a) +{ + return round_ps<RoundMode::CEIL_NOEXC>(a); +} +static SIMDINLINE Float SIMDCALL floor_ps(Float a) +{ + return round_ps<RoundMode::FLOOR_NOEXC>(a); +} //----------------------------------------------------------------------- // Integer (various width) arithmetic operations @@ -141,7 +126,7 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode:: SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) @@ -160,41 +145,40 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int) - +SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) +SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int) +SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) +SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int) +SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) +SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int) +SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) +SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int) //----------------------------------------------------------------------- // Shift operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT +SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT +SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) { int32_t a, count; - a = _mm_extract_epi32(vA, 0); + a = _mm_extract_epi32(vA, 0); count = _mm_extract_epi32(vB, 0); a <<= count; vA = _mm_insert_epi32(vA, a, 0); - a = _mm_extract_epi32(vA, 1); + a = _mm_extract_epi32(vA, 1); count = _mm_extract_epi32(vB, 1); a <<= count; vA = _mm_insert_epi32(vA, a, 1); - a = _mm_extract_epi32(vA, 2); + a = _mm_extract_epi32(vA, 2); count = _mm_extract_epi32(vB, 2); a <<= count; vA = _mm_insert_epi32(vA, a, 2); - a = _mm_extract_epi32(vA, 3); + a = _mm_extract_epi32(vA, 3); count = _mm_extract_epi32(vB, 3); a <<= count; vA = _mm_insert_epi32(vA, a, 3); @@ -211,7 +195,7 @@ static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n) return _mm_srl_epi64(a, n); } -template<int ImmT> // same as srli_si, but with Float cast to int +template <int ImmT> // same as srli_si, but with Float cast to int static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) { return castsi_ps(srli_si<ImmT>(castps_si(a))); @@ -220,22 +204,22 @@ static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) { int32_t a, count; - a = _mm_extract_epi32(vA, 0); + a = _mm_extract_epi32(vA, 0); count = _mm_extract_epi32(vB, 0); a >>= count; vA = _mm_insert_epi32(vA, a, 0); - a = _mm_extract_epi32(vA, 1); + a = _mm_extract_epi32(vA, 1); count = _mm_extract_epi32(vB, 1); a >>= count; vA = _mm_insert_epi32(vA, a, 1); - a = _mm_extract_epi32(vA, 2); + a = _mm_extract_epi32(vA, 2); count = _mm_extract_epi32(vB, 2); a >>= count; vA = _mm_insert_epi32(vA, a, 2); - a = _mm_extract_epi32(vA, 3); + a = _mm_extract_epi32(vA, 3); count = _mm_extract_epi32(vB, 3); a >>= count; vA = _mm_insert_epi32(vA, a, 3); @@ -243,32 +227,30 @@ static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return return vA; } - - 
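(Not part of the patch; editorial sketch.) The sllv_epi32 and srlv_epi32 wrappers reformatted above emulate per-lane variable shifts on AVX1, where no such instruction exists, by extracting each 32-bit lane, shifting it with scalar code, and inserting it back. A minimal standalone version of the same pattern, assuming only SSE4.1 intrinsics (the function name is illustrative, not from the source):

#include <immintrin.h>
#include <cstdint>

// Per-lane variable left shift, unrolled by hand because the lane index
// passed to _mm_extract_epi32/_mm_insert_epi32 must be a compile-time constant.
static inline __m128i sllv_epi32_emulated(__m128i v, __m128i counts)
{
    int32_t a;
    a = _mm_extract_epi32(v, 0); a <<= _mm_extract_epi32(counts, 0);
    v = _mm_insert_epi32(v, a, 0);
    a = _mm_extract_epi32(v, 1); a <<= _mm_extract_epi32(counts, 1);
    v = _mm_insert_epi32(v, a, 1);
    a = _mm_extract_epi32(v, 2); a <<= _mm_extract_epi32(counts, 2);
    v = _mm_insert_epi32(v, a, 2);
    a = _mm_extract_epi32(v, 3); a <<= _mm_extract_epi32(counts, 3);
    v = _mm_insert_epi32(v, a, 3);
    return v;
}

On AVX2 the same wrapper collapses to a single _mm_sllv_epi32(vA, vB) call, as the simdlib_128_avx2.inl hunk later in this diff shows.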
//----------------------------------------------------------------------- // Conversion operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) { return _mm_castpd_ps(a); } -static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) { return _mm_castps_si128(a); } -static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) { return _mm_castsi128_pd(a); } -static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) { return _mm_castps_pd(a); } -static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) { return _mm_castsi128_ps(a); } @@ -288,18 +270,19 @@ static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, return _mm_cvtsi32_si128(n); } -SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64) +SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16) +SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32) +SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32) +SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64) +SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64) -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) +static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) { return _mm_cvtps_epi32(a); } -static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) +static SIMDINLINE Integer SIMDCALL + cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) { return _mm_cvttps_epi32(a); } @@ -307,77 +290,104 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (in //----------------------------------------------------------------------- // Comparison operations //----------------------------------------------------------------------- -template<CompareType CmpTypeT> +template <CompareType CmpTypeT> static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b { return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); } -static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); } 
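(Not part of the patch; editorial sketch.) The cmp_ps<CmpTypeT> template above routes every float comparison through _mm_cmp_ps with a compile-time predicate, so cmplt_ps, cmpgt_ps and friends are thin aliases; the reformat below only expands them from one-liners to brace-on-own-line form. A minimal sketch of the underlying mapping, assuming AVX's _CMP_* predicate constants (illustrative name, not from the source):

#include <immintrin.h>

// Per-lane a < b, ordered and non-signaling: a lane becomes all-ones
// where the predicate holds and all-zeros where it does not.
static inline __m128 cmplt_ps_ref(__m128 a, __m128 b)
{
    return _mm_cmp_ps(a, b, _CMP_LT_OQ); // the predicate CompareType::LT_OQ stands for
}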
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) +{ + return cmp_ps<CompareType::LT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) +{ + return cmp_ps<CompareType::GT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) +{ + return cmp_ps<CompareType::NEQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) +{ + return cmp_ps<CompareType::EQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) +{ + return cmp_ps<CompareType::GE_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) +{ + return cmp_ps<CompareType::LE_OQ>(a, b); +} -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) +SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) +SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) +SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) +SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) +SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) +SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) +SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) +SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) +SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) -static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) +static SIMDINLINE bool SIMDCALL testz_ps(Float a, + Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) { - return 0 != _mm_testz_ps(a, b); + return 0 != _mm_testz_ps(a, b); } -static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) +static SIMDINLINE bool SIMDCALL testz_si(Integer a, + Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) { - return 0 != _mm_testz_si128(a, b); + return 0 != _mm_testz_si128(a, b); } //----------------------------------------------------------------------- // Blend / shuffle / permute operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) +SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) +SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, + Integer b, + Float mask) // return mask ? b : a (int) { return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); } -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, + Integer b, + Integer mask) // return mask ? 
b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+    broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
    return _mm_broadcast_ss(p);
}

-SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32

-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Integer SIMDCALL
+    permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}

-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+    permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm_permutevar_ps(a, swiz);
}

SIMD_IWRAPPER_1I(shuffle_epi32);

-template<int ImmT>
+template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;

SIMD_IWRAPPER_2(shuffle_epi8);

@@ -385,7 +395,7 @@ SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);

-//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));

@@ -405,68 +415,74 @@ SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult;
-    float* pResult = (float*)&vResult;
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult;
+    float*    pResult = (float*)&vResult;
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}

-static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+static SIMDINLINE Float SIMDCALL
+    load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

-static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+    load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
    return _mm_load_ps(p);
}

-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
    return _mm_load_si128(&p->v);
}

-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+    loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm_loadu_ps(p);
}

-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+    loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
    return _mm_lddqu_si128(&p->v);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult = old;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t umask = movemask_ps(mask);
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult  = old;
+    float*    pResult  = (float*)&vResult;
+    DWORD     index;
+    uint32_t  umask    = movemask_ps(mask);
    while (_BitScanForward(&index, umask))
    {
        umask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}

-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    _mm_maskstore_ps(p, mask, src);
}

@@ -495,37 +511,40 @@ static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements a
    return _mm_set1_epi8(i);
}

-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm_set1_ps(f);
}

-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm_setzero_ps();
}

-static SIMDINLINE Integer SIMDCALL setzero_si()    // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm_setzero_si128();
}

-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+    store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm_store_ps(p, a);
}

-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
    _mm_store_si128(&p->v, a);
}

-static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
+static SIMDINLINE void SIMDCALL
+    storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
    _mm_storeu_si128(&p->v, a);
}

-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+    stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm_stream_ps(p, a);
}

@@ -549,11 +568,10 @@ static SIMDINLINE float SIMDCALL extract_ps(Float a)

static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
-    Integer vec = set1_epi32(mask);
-    const Integer bit = set_epi32(
-        0x08, 0x04, 0x02, 0x01);
-    vec = and_si(vec, bit);
-    vec = cmplt_epi32(setzero_si(), vec);
+    Integer       vec = set1_epi32(mask);
+    const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
+    vec               = and_si(vec, bit);
+    vec               = cmplt_epi32(setzero_si(), vec);
    return castsi_ps(vec);
}

@@ -573,4 +591,3 @@ static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
-
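The vmask_ps helper reformatted above is worth a note: it expands the low four bits of a scalar mask into a full per-lane vector mask by broadcasting the mask, AND-ing it against the per-lane bit constants 0x01..0x08, and comparing the result against zero. A minimal scalar model of that logic, for illustration only (this helper is not part of the diff):

#include <cstdint>

// Scalar sketch of vmask_ps: lane i becomes all-ones when bit (1 << i) of
// the input mask is set, which is what set1_epi32/and_si/cmplt_epi32
// compute on the SIMD side.
static void vmask_model(int32_t mask, uint32_t out[4])
{
    const uint32_t laneBits[4] = {0x01, 0x02, 0x04, 0x08};
    for (int i = 0; i < 4; ++i)
    {
        // cmplt_epi32(0, vec) yields ~0 for lanes whose bit survived the AND
        out[i] = (uint32_t(mask) & laneBits[i]) ? 0xFFFFFFFFu : 0u;
    }
}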
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
index e8ee0b4d87b..35f9175ea46 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -1,25 +1,25 @@
/****************************************************************************
-* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

@@ -32,14 +32,11 @@
// Only 2 shifts and 2 gathers were introduced with AVX 2
// Also, add native support for FMA operations
//============================================================================
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm_##op(a, b, c);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }

-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c

static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
@@ -51,18 +48,19 @@ static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return
    return _mm_srlv_epi32(vA, vB);
}

-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}

#undef SIMD_WRAPPER_3
-
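Note how this file replaces the scalar gather emulation from the AVX version with the native AVX2 instruction: _mm_i32gather_ps fetches one float per lane at a byte offset of idx[i] * ScaleT. A hand-expanded sketch of the same call, with the wrapper's ScaleFactor template parameter reduced to a literal scale of 4 (illustrative only):

#include <immintrin.h>

// Each lane i of the result is *(float*)((char*)base + indices[i] * 4).
// The scale argument must be a compile-time constant of 1, 2, 4, or 8.
static __m128 gather4(const float* base, __m128i indices)
{
    return _mm_i32gather_ps(base, indices, 4);
}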
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
index b70a7691e2b..2ce3caa582f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -1,25 +1,25 @@
/****************************************************************************
-* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
@@ -34,120 +34,138 @@
//============================================================================

private:
-    static SIMDINLINE __m512  __conv(Float r)   { return _mm512_castps128_ps512(r.v); }
-    static SIMDINLINE __m512d __conv(Double r)  { return _mm512_castpd128_pd512(r.v); }
-    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
-    static SIMDINLINE Float   __conv(__m512 r)  { return _mm512_castps512_ps128(r); }
-    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
-    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
-public:
+static SIMDINLINE __m512 __conv(Float r)
+{
+    return _mm512_castps128_ps512(r.v);
+}
+static SIMDINLINE __m512d __conv(Double r)
+{
+    return _mm512_castpd128_pd512(r.v);
+}
+static SIMDINLINE __m512i __conv(Integer r)
+{
+    return _mm512_castsi128_si512(r.v);
+}
+static SIMDINLINE Float __conv(__m512 r)
+{
+    return _mm512_castps512_ps128(r);
+}
+static SIMDINLINE Double __conv(__m512d r)
+{
+    return _mm512_castpd512_pd128(r);
+}
+static SIMDINLINE Integer __conv(__m512i r)
+{
+    return _mm512_castsi512_si128(r);
+}

-#define SIMD_WRAPPER_1_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+public:
+#define SIMD_WRAPPER_1_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
-#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_WRAPPER_1I_(op, intrin, mask) \
+    template <int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
-#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_2_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_WRAPPER_2_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
-#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_2I(op) \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_WRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    { \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
    }

-#define SIMD_WRAPPER_3_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+#define SIMD_WRAPPER_3_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
    }
-#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
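The pattern behind all of these wrappers: a 128-bit operand is widened to 512 bits with __conv, the zero-masking 512-bit intrinsic runs with __mmask16(0xf) so only the low four lanes are computed, and the result is narrowed back. Hand-expanding what SIMD_WRAPPER_2(add_ps) generates makes the trick visible (a sketch, not the literal generated code, which goes through the library's Float wrapper type):

#include <immintrin.h>

// Requires AVX-512F. Lanes 4..15 of the widened registers are undefined,
// but the 0xf zero-mask guarantees they never reach the result.
static __m128 add_ps_via_avx512(__m128 a, __m128 b)
{
    __m512 wa = _mm512_castps128_ps512(a); // widen; upper lanes undefined
    __m512 wb = _mm512_castps128_ps512(b);
    __m512 r  = _mm512_maskz_add_ps(__mmask16(0xf), wa, wb); // low 4 lanes only
    return _mm512_castps512_ps128(r); // narrow back to 128 bits
}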
-#define SIMD_DWRAPPER_2I(op)    \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
-    {\
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+#define SIMD_DWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    { \
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
    }

-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
-#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))

-#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
-#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))

-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
-#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))

-#define SIMD_IWRAPPER_2I(op)    \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_IWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    { \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
    }

//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_2(add_ps);   // return a + b
+SIMD_WRAPPER_2(div_ps);   // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf));  // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
-SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)

// SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
-SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)

// SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)

@@ -155,23 +173,22 @@ SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf));        // return a & b       (int)
+SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf));       // return a & b    (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
-SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf));          // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf));        // return a ^ b       (int)
-
+SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf));         // return a | b    (int)
+SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf));       // return a ^ b    (int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I_32(slli_epi32);    // return a << ImmT
-SIMD_IWRAPPER_2_32(sllv_epi32);     // return a << b      (uint32)
-SIMD_IWRAPPER_1I_32(srai_epi32);    // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I_32(srli_epi32);    // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2_32(srlv_epi32);     // return a >> b      (uint32)
+SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)

// use AVX2 version
-//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+// SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)

//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------

@@ -185,16 +202,16 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b      (uint32)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions
//-----------------------------------------------------------------------
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi8);  // return a == b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
-//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
+// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
//    return cmpgt_epi32(b, a);
//}

@@ -202,24 +219,27 @@
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
-
-//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16
+// and _mm512_packs_epi16  SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation
+// for _mm256_packs_epi32 and _mm512_packs_epi32  SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 -->
+// uint8   See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for
+// _mm256_packus_epi32 and _mm512_packus_epi32  SIMD_IWRAPPER_2_(permute_epi32,
+// permutevar8x32_epi32);
+
+// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for
+// each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}

SIMD_IWRAPPER_1I_32(shuffle_epi32);

-//template<int ImmT>
-//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+// template<int ImmT>
+// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
-//SIMD_IWRAPPER_2(shuffle_epi8);
+// SIMD_IWRAPPER_2(shuffle_epi8);

SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);

@@ -233,50 +253,47 @@ SIMD_IWRAPPER_2_32(unpacklo_epi32);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+    load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}

-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}

-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+    loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}

-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+    loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}

-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return __conv(_mm512_mask_i32gather_ps(
-        _mm512_setzero_ps(),
-        __mmask16(0xf),
-        __conv(idx),
-        p,
-        static_cast<int>(ScaleT)));
+        _mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 m = 0xf;
-    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-                                    _mm512_set1_epi32(0x80000000));
-    return __conv(_mm512_mask_i32gather_ps(
-        __conv(old),
-        m,
-        __conv(idx),
-        p,
-        static_cast<int>(ScaleT)));
+    m           = _mm512_mask_test_epi32_mask(
+        m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
+    return __conv(
+        _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
}

// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)

@@ -286,19 +303,20 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
//     _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }

-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    __mmask16 m = 0xf;
-    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    m           = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
    _mm512_mask_storeu_ps(p, m, __conv(src));
}

-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+    store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
}

-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
    _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
}
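mask_i32gather_ps above encodes the AVX mask convention on AVX-512: _mm512_mask_test_epi32_mask turns the sign bit (bit 31) of each mask lane into a __mmask16 bit, and only those lanes are gathered while the rest keep old. A scalar model of that contract, for illustration (this helper is hypothetical, not part of the diff):

#include <cstdint>

// Lane i is fetched from p at byte offset idx[i] * scale only when bit 31
// of mask[i] is set; otherwise the previous value is carried through.
static void mask_gather_model(float out[4], const float prev[4], const float* p,
                              const int32_t idx[4], const uint32_t mask[4],
                              uint32_t scale)
{
    const char* base = reinterpret_cast<const char*>(p);
    for (int i = 0; i < 4; ++i)
    {
        out[i] = (mask[i] & 0x80000000u)
                     ? *reinterpret_cast<const float*>(base + idx[i] * scale)
                     : prev[i];
    }
}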
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
index a4ecd09f164..16e59c4decb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
@@ -1,25 +1,25 @@
/****************************************************************************
-* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

@@ -33,114 +33,118 @@
// register set.
//============================================================================

-#define SIMD_WRAPPER_1_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_WRAPPER_1_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
-#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_WRAPPER_1I_(op, intrin, mask) \
+    template <int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
-#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_2_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_WRAPPER_2_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
-#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))

-#define SIMD_WRAPPER_2I(op) \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_WRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    { \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
    }

-#define SIMD_WRAPPER_3_(op, intrin, mask)   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+#define SIMD_WRAPPER_3_(op, intrin, mask) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
    }
-#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))

-#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_DWRAPPER_1_(op, intrin, mask) \
+    static SIMDINLINE Double SIMDCALL op(Double a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
-#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))

-#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
-    template<int ImmT> \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
+    template <int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
-#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))

-#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_DWRAPPER_2_(op, intrin, mask) \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
-#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))

-#define SIMD_DWRAPPER_2I(op)    \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+#define SIMD_DWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    { \
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
    }

-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
-#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
-#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    { \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
-#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_2I(op)    \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    { \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
    }

-SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_8(add_epi8);   // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);  // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);  // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8    See documentation for _mm256_packs_epi16 and
+                                // _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16   See documentation for _mm256_packs_epi32 and
+                                 // _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8  See documentation for _mm256_packus_epi16 and
+                                 // _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
+                                  // _mm512_packus_epi32
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);

@@ -151,8 +155,7 @@ SIMD_IWRAPPER_2_8(unpacklo_epi8);
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    __mmask64 m = 0xffffull;
-    return static_cast<uint32_t>(
-        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}

#undef SIMD_WRAPPER_1_
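movemask_epi8 above has no direct 128-bit counterpart in this AVX-512 path, so it tests bit 7 of each byte under a 16-byte mask (0xffff) and returns the resulting bitmask. A scalar model, for illustration only:

#include <cstdint>

// Bit i of the result is the top bit of byte i, matching what
// _mm512_mask_test_epi8_mask computes for the 16 bytes selected by 0xffff.
static uint32_t movemask_epi8_model(const uint8_t bytes[16])
{
    uint32_t result = 0;
    for (int i = 0; i < 16; ++i)
        result |= (uint32_t)((bytes[i] >> 7) & 1u) << i;
    return result;
}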
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
index b0cae503419..1b6592e2003 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
@@ -1,25 +1,25 @@
/****************************************************************************
-* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

@@ -32,4 +32,3 @@
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
-
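Every one of these .inl files opens with the same guard, so they compile only when pulled in through simdlib.hpp, which defines the umbrella macro before including the ISA-specific bodies. A sketch of that layering (the macro name and error message match the diff; the include order shown is illustrative):

// simdlib.hpp (sketch of the umbrella header):
//     #define __SIMD_LIB_AVX512_HPP__
//     #include "simdlib_128_avx512.inl"
//     #include "simdlib_128_avx512_knights.inl"
//
// Each .inl then refuses direct inclusion:
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif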
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
index 00c094a425a..4ac0f95a468 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -1,25 +1,25 @@
/****************************************************************************
-* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

@@ -30,178 +30,172 @@ using SIMD128T = SIMD128Impl::AVXImpl;
// SIMD256 AVX (1) implementation
//============================================================================

-#define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
-    {\
-        return _mm256_##op(a);\
-    }
+#define SIMD_WRAPPER_1(op) \
+    static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }

-#define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_WRAPPER_2(op) \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    { \
+        return _mm256_##op(a, b); \
    }

-#define SIMD_DWRAPPER_2(op) \
-    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_DWRAPPER_2(op) \
+    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
+    { \
+        return _mm256_##op(a, b); \
    }

-#define SIMD_WRAPPER_2I(op) \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return _mm256_##op(a, b, ImmT);\
+#define SIMD_WRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    { \
+        return _mm256_##op(a, b, ImmT); \
    }

-#define SIMD_DWRAPPER_2I(op) \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
-    {\
-        return _mm256_##op(a, b, ImmT);\
+#define SIMD_DWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
+    { \
+        return _mm256_##op(a, b, ImmT); \
    }

-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
-    {\
-        return _mm256_##op(a, b, c);\
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
+    { \
+        return _mm256_##op(a, b, c); \
    }

-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##op(a);\
-    }
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }

-#define SIMD_IWRAPPER_2(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_IWRAPPER_2(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return _mm256_##op(a, b); \
    }

-#define SIMD_IFWRAPPER_2(op, intrin) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+#define SIMD_IFWRAPPER_2(op, intrin) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \
    }

-#define SIMD_IFWRAPPER_2I(op, intrin)   \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
+#define SIMD_IFWRAPPER_2I(op, intrin) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT)); \
    }

-#define SIMD_IWRAPPER_2I_(op, intrin)   \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##intrin(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I_(op, intrin) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return _mm256_##intrin(a, b, ImmT); \
    }
-#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

-#define SIMD_IWRAPPER_3(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
-    {\
-        return _mm256_##op(a, b, c);\
+#define SIMD_IWRAPPER_3(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
+    { \
+        return _mm256_##op(a, b, c); \
    }

// emulated integer simd
-#define SIMD_EMU_IWRAPPER_1(op) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::op(a.v4[0]),\
-            SIMD128T::op(a.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    { \
+        return Integer{ \
+            SIMD128T::op(a.v4[0]), \
+            SIMD128T::op(a.v4[1]), \
+        }; \
    }

-#define SIMD_EMU_IWRAPPER_1L(op, shift) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer \
-        {\
-            SIMD128T::op(a.v4[0]), \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
-        };\
-    }\
-    static SIMDINLINE \
-    Integer SIMDCALL op(SIMD128Impl::Integer const &a)\
-    {\
-        return Integer \
-        {\
-            SIMD128T::op(a), \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
-        };\
+#define SIMD_EMU_IWRAPPER_1L(op, shift) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    { \
+        return Integer{ \
+            SIMD128T::op(a.v4[0]), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
+        }; \
+    } \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
+    { \
+        return Integer{ \
+            SIMD128T::op(a), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
+        }; \
    }

-#define SIMD_EMU_IWRAPPER_1I(op) \
-    template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::template op<ImmT>(a.v4[0]),\
-            SIMD128T::template op<ImmT>(a.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_1I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    { \
+        return Integer{ \
+            SIMD128T::template op<ImmT>(a.v4[0]), \
+            SIMD128T::template op<ImmT>(a.v4[1]), \
+        }; \
    }

-#define SIMD_EMU_IWRAPPER_2(op) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a, Integer const &b)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::op(a.v4[0], b.v4[0]),\
-            SIMD128T::op(a.v4[1], b.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_2(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return Integer{ \
+            SIMD128T::op(a.v4[0], b.v4[0]), \
+            SIMD128T::op(a.v4[1], b.v4[1]), \
+        }; \
    }

-#define SIMD_EMU_IWRAPPER_2I(op) \
-    template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a, Integer const &b)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),\
-            SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_2I(op) \
+    template <int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    { \
+        return Integer{ \
+            SIMD128T::template op<ImmT>(a.v4[0], b.v[0]), \
+            SIMD128T::template op<ImmT>(a.v4[1], b.v[1]), \
+        }; \
    }

//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b

-static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
+                                          Float const& b,
+                                          Float const& c) // return (a * b) + c
{
    return add_ps(mul_ps(a, b), c);
}

-static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c) // return (a * b) - c
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
+                                          Float const& b,
+                                          Float const& c) // return (a * b) - c
{
    return sub_ps(mul_ps(a, b), c);
}

-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
+SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);   // return a - b

template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}

-static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
@@ -209,7 +203,7 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<Roun
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
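The fmadd_ps/fmsub_ps definitions above are the AVX (1) fallback: with no FMA instruction available they compose multiply and add, which rounds twice instead of once. A standalone sketch of the same fallback:

#include <immintrin.h>

// AVX1 fallback for fused multiply-add: (a * b) + c with two roundings.
// On AVX2+ hardware the single-rounding _mm256_fmadd_ps replaces this.
static __m256 fmadd_fallback(__m256 a, __m256 b, __m256 c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}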
@@ -237,182 +231,184 @@ SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps);      // return a ^ b (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)

-
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
-SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT

-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b (uint32)
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
+                                              Integer const& vCount) // return a << b (uint32)
{
    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

-    aHi = _mm_extract_epi32(vAHi, 0);
+    aHi     = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

-    aLow = _mm_extract_epi32(vALow, 0);
+    aLow     = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

-    aHi = _mm_extract_epi32(vAHi, 1);
+    aHi     = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

-    aLow = _mm_extract_epi32(vALow, 1);
+    aLow     = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

-    aHi = _mm_extract_epi32(vAHi, 2);
+    aHi     = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

-    aLow = _mm_extract_epi32(vALow, 2);
+    aLow     = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

-    aHi = _mm_extract_epi32(vAHi, 3);
+    aHi     = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

-    aLow = _mm_extract_epi32(vALow, 3);
+    aLow     = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret         = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}

-SIMD_EMU_IWRAPPER_1I(srai_epi32);  // return a >> ImmT   (int32)
-SIMD_EMU_IWRAPPER_1I(srli_epi32);  // return a >> ImmT   (uint32)
-SIMD_EMU_IWRAPPER_1I(srli_si);     // return a >> (ImmT*8) (uint)
+SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
+SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
+SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint)
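sllv_epi32 above shows the cost of the missing per-lane variable shift on AVX (1): each of the eight lanes is extracted, shifted in scalar code, and reinserted. A scalar model of what the whole extract/shift/insert sequence computes (illustrative only):

#include <cstdint>

// Per-lane variable left shift, lane by lane, as the emulation above does.
static void sllv_epi32_model(uint32_t a[8], const uint32_t count[8])
{
    for (int i = 0; i < 8; ++i)
        a[i] <<= count[i];
}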
// same as srli_si, but with Float cast to int +static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a) { return castsi_ps(srli_si<ImmT>(castps_si(a))); } -static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b (uint32) +static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA, + Integer const& vCount) // return a >> b (uint32) { int32_t aHi, aLow, countHi, countLow; - __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); - __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); - __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); - aHi = _mm_extract_epi32(vAHi, 0); + aHi = _mm_extract_epi32(vAHi, 0); countHi = _mm_extract_epi32(vCountHi, 0); aHi >>= countHi; vAHi = _mm_insert_epi32(vAHi, aHi, 0); - aLow = _mm_extract_epi32(vALow, 0); + aLow = _mm_extract_epi32(vALow, 0); countLow = _mm_extract_epi32(vCountLow, 0); aLow >>= countLow; vALow = _mm_insert_epi32(vALow, aLow, 0); - aHi = _mm_extract_epi32(vAHi, 1); + aHi = _mm_extract_epi32(vAHi, 1); countHi = _mm_extract_epi32(vCountHi, 1); aHi >>= countHi; vAHi = _mm_insert_epi32(vAHi, aHi, 1); - aLow = _mm_extract_epi32(vALow, 1); + aLow = _mm_extract_epi32(vALow, 1); countLow = _mm_extract_epi32(vCountLow, 1); aLow >>= countLow; vALow = _mm_insert_epi32(vALow, aLow, 1); - aHi = _mm_extract_epi32(vAHi, 2); + aHi = _mm_extract_epi32(vAHi, 2); countHi = _mm_extract_epi32(vCountHi, 2); aHi >>= countHi; vAHi = _mm_insert_epi32(vAHi, aHi, 2); - aLow = _mm_extract_epi32(vALow, 2); + aLow = _mm_extract_epi32(vALow, 2); countLow = _mm_extract_epi32(vCountLow, 2); aLow >>= countLow; vALow = _mm_insert_epi32(vALow, aLow, 2); - aHi = _mm_extract_epi32(vAHi, 3); + aHi = _mm_extract_epi32(vAHi, 3); countHi = _mm_extract_epi32(vCountHi, 3); aHi >>= countHi; vAHi = _mm_insert_epi32(vAHi, aHi, 3); - aLow = _mm_extract_epi32(vALow, 3); + aLow = _mm_extract_epi32(vALow, 3); countLow = _mm_extract_epi32(vCountLow, 3); aLow >>= countLow; vALow = _mm_insert_epi32(vALow, aLow, 3); __m256i ret = _mm256_set1_epi32(0); - ret = _mm256_insertf128_si256(ret, vAHi, 1); - ret = _mm256_insertf128_si256(ret, vALow, 0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); return ret; } - - //----------------------------------------------------------------------- // Conversion operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) { return _mm256_castpd_ps(a); } -static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) { return _mm256_castps_si256(a); } -static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) { return _mm256_castsi256_pd(a); } -static SIMDINLINE Double SIMDCALL castps_pd(Float 
const &a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) { return _mm256_castps_pd(a); } -static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a) { return _mm256_castpd_si256(a); } -static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) { return _mm256_castsi256_ps(a); } -static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float) +static SIMDINLINE Float SIMDCALL + cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) { return _mm256_cvtepi32_ps(a); } -SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16) -SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32) -SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32) -SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64) -SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64) +SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16) +SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32) +SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32) +SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64) +SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64) -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32) +static SIMDINLINE Integer SIMDCALL + cvtps_epi32(Float const& a) // return (int32)a (float --> int32) { return _mm256_cvtps_epi32(a); } -static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32) +static SIMDINLINE Integer SIMDCALL + cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) { return _mm256_cvttps_epi32(a); } @@ -420,79 +416,107 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // ret //----------------------------------------------------------------------- // Comparison operations //----------------------------------------------------------------------- -template<CompareType CmpTypeT> -static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b +template <CompareType CmpTypeT> +static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b { return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); } -static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); } +static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::LT_OQ>(a, b); +} 
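For reference, cmp_ps and the cmp*_ps helpers above return a per-lane mask (all bits set in each lane where the predicate holds), which is exactly what blendv_ps and movemask_ps consume. A minimal usage sketch follows, an illustration only, assuming the SIMD256 AVX target defined in this file (SIMD_WIDTH == 8) and using only wrappers declared here:

    Float    a    = set1_ps(1.0f);
    Float    b    = set1_ps(2.0f);
    Float    m    = cmplt_ps(a, b);     // 1.0f < 2.0f in every lane: all-ones pattern per lane
    Float    sel  = blendv_ps(a, b, m); // per-lane mask ? b : a, so every lane takes b
    uint32_t bits = movemask_ps(m);     // one sign bit per lane: 0xFF for 8 passing lanes
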
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::GT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::NEQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::EQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::GE_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::LE_OQ>(a, b); +} -SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32) +SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) +SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) +SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) +SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) +SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) +SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) +SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) +SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) +SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32) -static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float) +static SIMDINLINE bool SIMDCALL + testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float) { - return 0 != _mm256_testz_ps(a, b); + return 0 != _mm256_testz_ps(a, b); } -static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int) +static SIMDINLINE bool SIMDCALL + testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int) { - return 0 != _mm256_testz_si256(a, b); + return 0 != _mm256_testz_si256(a, b); } //----------------------------------------------------------------------- // Blend / shuffle / permute operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) +SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) +SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32) +SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, + Integer const& b, + Float const& mask) // return mask ? b : a (int) { return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); } -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, + Integer const& b, + Integer const& mask) // return mask ? 
b : a (int) { return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); } -static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) +static SIMDINLINE Float SIMDCALL + broadcast_ss(float const* p) // return *p (all elements in vector get same value) { return _mm256_broadcast_ss(p); } -SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 +SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 +SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 +SIMD_EMU_IWRAPPER_2( + packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 +SIMD_EMU_IWRAPPER_2( + packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 -template<int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a) +template <int ImmT> +static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) { return _mm256_permute_ps(a, ImmT); } -static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32) +static SIMDINLINE Integer SIMDCALL permute_epi32( + Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) { Integer result; // Ugly slow implementation - uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a); - uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz); - uint32_t *pResult = reinterpret_cast<uint32_t *>(&result); + uint32_t const* pA = reinterpret_cast<uint32_t const*>(&a); + uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz); + uint32_t* pResult = reinterpret_cast<uint32_t*>(&result); for (uint32_t i = 0; i < SIMD_WIDTH; ++i) { @@ -502,14 +526,15 @@ static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const return result; } -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float) +static SIMDINLINE Float SIMDCALL + permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) { Float result; // Ugly slow implementation - float const *pA = reinterpret_cast<float const*>(&a); - uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz); - float *pResult = reinterpret_cast<float *>(&result); + float const* pA = reinterpret_cast<float const*>(&a); + uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz); + float* pResult = reinterpret_cast<float*>(&result); for (uint32_t i = 0; i < SIMD_WIDTH; ++i) { @@ -523,11 +548,10 @@ SIMD_WRAPPER_2I(permute2f128_ps); SIMD_DWRAPPER_2I(permute2f128_pd); SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); - SIMD_EMU_IWRAPPER_1I(shuffle_epi32); -template<int ImmT> -static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b) +template <int ImmT> +static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b) { return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); } @@ -550,83 +574,88 @@ SIMD_WRAPPER_2(unpacklo_ps); 
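The load / store hunk below reformats the AVX1 gather emulation, whose per-lane behavior reduces to a scalar load at a byte offset of idx[i] * ScaleT past p. A reference sketch of that semantic, using a hypothetical helper name (gather_lane) for illustration and assuming ScaleFactor enumerators carry the byte scale as their integer value:

    #include <cstdint>
    #include <cstring>

    // One lane of i32gather_ps<ScaleT>(p, idx): read a float from
    // ((uint8_t const*)p) + idx[i] * ScaleT, matching the emulation loop below.
    static float gather_lane(float const* p, uint32_t index, uint32_t scale)
    {
        float out;
        std::memcpy(&out, reinterpret_cast<uint8_t const*>(p) + index * scale, sizeof(out));
        return out; // memcpy sidesteps the aliasing cast the emulation uses directly
    }
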
//----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { - uint32_t *pOffsets = (uint32_t*)&idx; - Float vResult; - float* pResult = (float*)&vResult; + uint32_t* pOffsets = (uint32_t*)&idx; + Float vResult; + float* pResult = (float*)&vResult; for (uint32_t i = 0; i < SIMD_WIDTH; ++i) { uint32_t offset = pOffsets[i]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[i] = *(float const*)(((uint8_t const*)p + offset)); + offset = offset * static_cast<uint32_t>(ScaleT); + pResult[i] = *(float const*)(((uint8_t const*)p + offset)); } return vResult; } -static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) +static SIMDINLINE Float SIMDCALL + load1_ps(float const* p) // return *p (broadcast 1 value to all elements) { return broadcast_ss(p); } -static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) +static SIMDINLINE Float SIMDCALL + load_ps(float const* p) // return *p (loads SIMD width elements from memory) { return _mm256_load_ps(p); } -static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p +static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p { return _mm256_load_si256(&p->v); } -static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) +static SIMDINLINE Float SIMDCALL + loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) { return _mm256_loadu_ps(p); } -static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) +static SIMDINLINE Integer SIMDCALL + loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) { return _mm256_lddqu_si256(&p->v); } // for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask) -{ - uint32_t *pOffsets = (uint32_t*)&idx; - Float vResult = old; - float* pResult = (float*)&vResult; - DWORD index; - uint32_t umask = movemask_ps(mask); +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) +{ + uint32_t* pOffsets = (uint32_t*)&idx; + Float vResult = old; + float* pResult = (float*)&vResult; + DWORD index; + uint32_t umask = movemask_ps(mask); while (_BitScanForward(&index, umask)) { umask &= ~(1 << index); uint32_t offset = pOffsets[index]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[index] = *(float const *)(((uint8_t const *)p + offset)); + offset = offset * static_cast<uint32_t>(ScaleT); + pResult[index] = *(float const*)(((uint8_t const*)p + offset)); } return vResult; } -static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src) +static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) { _mm256_maskstore_ps(p, mask, src); } -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) { - return SIMD128T::movemask_epi8(a.v4[0]) | - (SIMD128T::movemask_epi8(a.v4[1]) << 16); + return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16); } -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) { return static_cast<uint32_t>(_mm256_movemask_pd(a)); } -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) { return static_cast<uint32_t>(_mm256_movemask_ps(a)); } @@ -641,32 +670,34 @@ static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements a return _mm256_set1_epi8(i); } -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) +static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) { return _mm256_set1_ps(f); } -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) +static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) { return _mm256_setzero_ps(); } -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) +static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) { return _mm256_setzero_si256(); } -static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory) +static SIMDINLINE void SIMDCALL + store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) { _mm256_store_ps(p, a); } -static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a +static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a { _mm256_store_si256(&p->v, a); } -static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache) +static SIMDINLINE void SIMDCALL + stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) { _mm256_stream_ps(p, a); } @@ -675,43 +706,43 @@ static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a // Legacy interface (available only in SIMD256 
width) //======================================================================= -static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p) +static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p) { return _mm256_broadcast_ps(&p->v); } -template<int ImmT> -static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a) +template <int ImmT> +static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a) { return _mm256_extractf128_pd(a, ImmT); } -template<int ImmT> -static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const &a) +template <int ImmT> +static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a) { return _mm256_extractf128_ps(a, ImmT); } -template<int ImmT> -static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a) +template <int ImmT> +static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a) { return _mm256_extractf128_si256(a, ImmT); } -template<int ImmT> -static SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b) +template <int ImmT> +static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b) { return _mm256_insertf128_pd(a, b, ImmT); } -template<int ImmT> -static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b) +template <int ImmT> +static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b) { return _mm256_insertf128_ps(a, b, ImmT); } -template<int ImmT> -static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b) +template <int ImmT> +static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b) { return _mm256_insertf128_si256(a, b, ImmT); } @@ -727,33 +758,37 @@ static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl:: _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)) #endif -static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo) +static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, + SIMD128Impl::Integer const* plo) { return _mm256_loadu2_m128i(&phi->v, &plo->v); } -static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +static SIMDINLINE Integer SIMDCALL + set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) { return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) +static SIMDINLINE Float SIMDCALL + set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) { return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src) +static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi, + SIMD128Impl::Integer* plo, + Integer const& src) { _mm256_storeu2_m128i(&phi->v, &plo->v, src); } static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) { - Integer vec = set1_epi32(mask); - const Integer bit = set_epi32( - 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = and_si(vec, bit); - vec = cmplt_epi32(setzero_si(), vec); + Integer vec = set1_epi32(mask); + const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = and_si(vec, bit); 
+ vec = cmplt_epi32(setzero_si(), vec); return castsi_ps(vec); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl index 96c24fff9da..59a61cf9263 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX2_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. 
#endif @@ -32,62 +32,61 @@ // Mostly these are integer operations that are no longer emulated with SSE //============================================================================ -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ - {\ - return _mm256_##op(a);\ +#define SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); } + +#define SIMD_IWRAPPER_1L(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ + { \ + return _mm256_##op(_mm256_castsi256_si128(a)); \ } -#define SIMD_IWRAPPER_1L(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ - {\ - return _mm256_##op(_mm256_castsi256_si128(a));\ - }\ - -#define SIMD_IWRAPPER_1I(op) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ - {\ - return _mm256_##op(a, ImmT);\ +#define SIMD_IWRAPPER_1I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ + { \ + return _mm256_##op(a, ImmT); \ } -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ - {\ - return _mm256_##intrin(a, ImmT);\ +#define SIMD_IWRAPPER_1I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ + { \ + return _mm256_##intrin(a, ImmT); \ } -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return _mm256_##intrin(a, b);\ +#define SIMD_IWRAPPER_2_(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return _mm256_##intrin(a, b); \ } -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return _mm256_##op(a, b);\ +#define SIMD_IWRAPPER_2(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return _mm256_##op(a, b); \ } -#define SIMD_IWRAPPER_2I(op) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return _mm256_##op(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return _mm256_##op(a, b, ImmT); \ } -#define SIMD_IWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return _mm256_##op(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return _mm256_##op(a, b, ImmT); \ } //----------------------------------------------------------------------- // Floating point arithmetic operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c +static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, + Float const& b, + Float const& c) // return (a * b) + c { return _mm256_fmadd_ps(a, b, c); } @@ -98,7 +97,7 @@ static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 
0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) @@ -117,51 +116,50 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int) -SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int) - +SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int) +SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int) +SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int) +SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int) //----------------------------------------------------------------------- // Shift operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) -SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) -SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) +SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT +SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) +SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) +SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) +SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) +SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) -template<int ImmT> // same as srli_si, but with Float cast to int -static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) +template <int ImmT> // same as srli_si, but with Float cast to int +static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a) { return castsi_ps(srli_si<ImmT>(castps_si(a))); } - //----------------------------------------------------------------------- // Conversion operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64) +SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16) +SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32) +SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32) +SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64) +SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64) //----------------------------------------------------------------------- // Comparison operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 
-SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) - -static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const &a, Integer const &b) // return a < b (int32) +SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) +SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) +SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) +SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) +SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) +SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) +SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) +SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) + +static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a, + Integer const& b) // return a < b (int32) { return cmpgt_epi32(b, a); } @@ -169,28 +167,29 @@ static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const &a, Integer const & //----------------------------------------------------------------------- // Blend / shuffle / permute operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 +SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) +SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 +SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 +SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 +SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 -template<int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a) +template <int ImmT> +static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) { return _mm256_permute_ps(a, ImmT); } SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32); -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float) +static SIMDINLINE Float SIMDCALL + permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) { return _mm256_permutevar8x32_ps(a, swiz); } SIMD_IWRAPPER_1I(shuffle_epi32); -template<int ImmT> -static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b) +template <int ImmT> +static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b) { return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); } @@ -207,22 +206,24 @@ SIMD_IWRAPPER_2(unpacklo_epi8); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT)); } // for each element: (mask & 
(1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) { - // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256() - // Only for this intrinsic - not sure why. :( + // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256() + // Only for this intrinsic - not sure why. :( return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT)); } -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) { return static_cast<uint32_t>(_mm256_movemask_epi8(a)); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl index 3fcfd250f91..790609861e5 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -34,120 +34,138 @@ //============================================================================ private: - static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps256_ps512(r.v); } - static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); } - static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); } - static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps256(r); } - static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd256(r); } - static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); } -public: +static SIMDINLINE __m512 __conv(Float r) +{ + return _mm512_castps256_ps512(r.v); +} +static SIMDINLINE __m512d __conv(Double r) +{ + return _mm512_castpd256_pd512(r.v); +} +static SIMDINLINE __m512i __conv(Integer r) +{ + return _mm512_castsi256_si512(r.v); +} +static SIMDINLINE Float __conv(__m512 r) +{ + return _mm512_castps512_ps256(r); +} +static SIMDINLINE Double __conv(__m512d r) +{ + return _mm512_castpd512_pd256(r); +} +static SIMDINLINE Integer __conv(__m512i r) +{ + return _mm512_castsi512_si256(r); +} -#define SIMD_WRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\ +public: +#define SIMD_WRAPPER_1_(op, intrin, mask) \ + static SIMDINLINE Float SIMDCALL op(Float a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ } -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff)) +#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff)) -#define SIMD_WRAPPER_1I_(op, intrin, mask) \ - template<int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\ +#define SIMD_WRAPPER_1I_(op, intrin, mask) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ } -#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff)) +#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff)) -#define SIMD_WRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\ +#define SIMD_WRAPPER_2_(op, intrin, mask) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff)) +#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff)) -#define SIMD_WRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\ +#define SIMD_WRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \ } -#define SIMD_WRAPPER_3_(op, intrin, mask) \ - static SIMDINLINE Float 
SIMDCALL op(Float a, Float b, Float c) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\ +#define SIMD_WRAPPER_3_(op, intrin, mask) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \ } -#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff)) +#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff)) -#define SIMD_DWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\ +#define SIMD_DWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ } -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\ +#define SIMD_IWRAPPER_1_(op, intrin, mask) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ } -#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff)) +#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff)) -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\ +#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ } -#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff)) +#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff)) -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\ +#define SIMD_IWRAPPER_2_(op, intrin, mask) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ } -#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff)) +#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff)) -#define SIMD_IWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\ +#define SIMD_IWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \ } //----------------------------------------------------------------------- // Single precision floating point arithmetic operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b +SIMD_WRAPPER_2(add_ps); // return a + b +SIMD_WRAPPER_2(div_ps); // return a / b +SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c +SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c +SIMD_WRAPPER_2(max_ps); // return (a > b) ? 
a : b +SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b +SIMD_WRAPPER_2(mul_ps); // return a * b SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff)); // return 1.0f / a -SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b +SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a) +SIMD_WRAPPER_2(sub_ps); // return a - b //----------------------------------------------------------------------- // Integer (various width) arithmetic operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32) +SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32) +SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32) +SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32) +SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32) +SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32) +SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32) +SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32) // SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) // return (a * b) & 0xFFFFFFFF // // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, // and store the low 32 bits of the intermediate integers in dst. SIMD_IWRAPPER_2_32(mullo_epi32); -SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32) +SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32) // SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) // SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) @@ -155,23 +173,22 @@ SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int) +SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int) SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int) - +SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int) +SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int) //----------------------------------------------------------------------- // Shift operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32) -SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) +SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT +SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32) +SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32) +SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32) +SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) // use AVX2 version -//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) +// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) //----------------------------------------------------------------------- // Conversion operations (Use AVX2 versions) @@ -185,16 +202,16 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) //----------------------------------------------------------------------- // Comparison operations (Use AVX2 versions //----------------------------------------------------------------------- -//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8) -//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16) -//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32) -//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64) // -//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32) +// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32) //{ // return cmpgt_epi32(b, a); //} @@ -202,25 +219,28 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) //----------------------------------------------------------------------- // Blend / shuffle / permute operations 
//----------------------------------------------------------------------- -// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 +// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 +// and _mm512_packs_epi16 SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation +// for _mm256_packs_epi32 and _mm512_packs_epi32 SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> +// uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 +// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for +// _mm256_packus_epi32 and _mm512_packus_epi32 // SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32); -//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) +// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for +// each 32-bit lane i (float) //{ // return _mm256_permutevar8x32_ps(a, swiz); //} SIMD_IWRAPPER_1I_32(shuffle_epi32); -//template<int ImmT> -//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) +// template<int ImmT> +// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) //{ // return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); //} -//SIMD_IWRAPPER_2(shuffle_epi8); +// SIMD_IWRAPPER_2(shuffle_epi8); SIMD_IWRAPPER_2_32(unpackhi_epi32); SIMD_IWRAPPER_2_32(unpacklo_epi32); @@ -234,50 +254,47 @@ SIMD_IWRAPPER_2_32(unpacklo_epi32); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) +static SIMDINLINE Float SIMDCALL + load_ps(float const* p) // return *p (loads SIMD width elements from memory) { return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p)); } -static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p +static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p { return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p)); } -static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) +static SIMDINLINE Float SIMDCALL + loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) { return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p)); } -static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) +static SIMDINLINE Integer SIMDCALL + loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) { return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p)); } -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { return 
__conv(_mm512_mask_i32gather_ps( - _mm512_setzero_ps(), - __mmask16(0xff), - __conv(idx), - p, - static_cast<int>(ScaleT))); + _mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT))); } // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { __mmask16 m = 0xff; - m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)), - _mm512_set1_epi32(0x80000000)); - return __conv(_mm512_mask_i32gather_ps( - __conv(old), - m, - __conv(idx), - p, - static_cast<int>(ScaleT))); + m = _mm512_mask_test_epi32_mask( + m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000)); + return __conv( + _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT))); } // static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) @@ -287,19 +304,20 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In // _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); // } -static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) +static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) { __mmask16 m = 0xff; - m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000)); + m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000)); _mm512_mask_storeu_ps(p, m, __conv(src)); } -static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory) +static SIMDINLINE void SIMDCALL + store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) { _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a)); } -static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a +static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a { _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a)); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl index 6ffe7c2a0f0..1acdc7e07ff 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -33,65 +33,68 @@ // register set. //============================================================================ -#define SIMD_DWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\ +#define SIMD_DWRAPPER_1_(op, intrin, mask) \ + static SIMDINLINE Double SIMDCALL op(Double a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ } -#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf)) +#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf)) -#define SIMD_DWRAPPER_1I_(op, intrin, mask) \ - template<int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\ +#define SIMD_DWRAPPER_1I_(op, intrin, mask) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ } -#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf)) +#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf)) -#define SIMD_DWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\ +#define SIMD_DWRAPPER_2_(op, intrin, mask) \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ } -#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf)) +#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf)) -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\ +#define SIMD_IWRAPPER_1_(op, 
intrin, mask) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ } -#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff)) -#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf)) +#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull)) +#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff)) +#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf)) -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\ +#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ } -#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff)) -#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf)) +#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull)) +#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff)) +#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf)) -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\ +#define SIMD_IWRAPPER_2_(op, intrin, mask) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ } -#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff)) -#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf)) +#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull)) +#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff)) +#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf)) - -SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) -SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 +SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) +SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) +SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) +SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and + // _mm512_packs_epi16 +SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and + // _mm512_packs_epi32 +SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and + // _mm512_packus_epi16 +SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and + // _mm512_packus_epi32 SIMD_IWRAPPER_2_16(unpackhi_epi16); SIMD_IWRAPPER_2_64(unpackhi_epi64); SIMD_IWRAPPER_2_8(unpackhi_epi8); @@ -102,8 +105,7 @@ SIMD_IWRAPPER_2_8(unpacklo_epi8); static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) { __mmask64 m = 0xffffffffull; - return static_cast<uint32_t>( - _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); + return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); } #undef SIMD_DWRAPPER_1_ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl index acd8ffd9688..52b6ca2b61e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -32,4 +32,3 @@ // These use native AVX512 instructions with masking to enable a larger // register set. //============================================================================ - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index dfe19d3c04a..e9e908ac3c6 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -1,41 +1,41 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif -#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) // gcc as of 7.1 was missing these intrinsics #ifndef _mm512_cmpneq_ps_mask -#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ) +#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ) #endif #ifndef _mm512_cmplt_ps_mask -#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS) +#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS) #endif #ifndef _mm512_cmplt_pd_mask -#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS) +#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS) #endif #endif @@ -47,136 +47,108 @@ //============================================================================ static const int TARGET_SIMD_WIDTH = 16; -using SIMD256T = SIMD256Impl::AVX2Impl; +using SIMD256T = SIMD256Impl::AVX2Impl; -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return intrin(a);\ - } +#define SIMD_WRAPPER_1_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } -#define SIMD_WRAPPER_1(op) \ - SIMD_WRAPPER_1_(op, _mm512_##op) +#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b);\ - } +#define SIMD_WRAPPER_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_castsi512_ps(_mm512_##intrin(\ - _mm512_castps_si512(a), _mm512_castps_si512(b)));\ +#define SIMD_WRAPPERI_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_castsi512_ps( \ + _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ } -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##op(a, b);\ - } +#define SIMD_DWRAPPER_2(op) \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } -#define SIMD_WRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_WRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) +#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_DWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return 
_mm512_##intrin(a, b, ImmT); \ } -#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) +#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - {\ - return _mm512_##op(a, b, c);\ - } +#define SIMD_WRAPPER_3(op) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return _mm512_##op(a);\ - } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } +#define SIMD_IWRAPPER_1_8(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1_4(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return intrin(a, ImmT);\ +#define SIMD_IWRAPPER_1I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return intrin(a, ImmT); \ } #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b);\ - } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) +#define SIMD_IWRAPPER_2_(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } +#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return cmp(a, b);\ - } +#define SIMD_IWRAPPER_2_CMP(op, cmp) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\ +#define SIMD_IFWRAPPER_2(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ } -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) private: - static SIMDINLINE Integer vmask(__mmask16 m) - { - return _mm512_maskz_set1_epi32(m, -1); - } +static SIMDINLINE Integer vmask(__mmask16 m) +{ + return _mm512_maskz_set1_epi32(m, -1); +} - static SIMDINLINE Integer vmask(__mmask8 m) - { - return _mm512_maskz_set1_epi64(m, -1LL); - } +static SIMDINLINE Integer vmask(__mmask8 m) +{ + return _mm512_maskz_set1_epi64(m, -1LL); +} public: //----------------------------------------------------------------------- // Single precision floating point arithmetic operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); 
// return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps); // return 1.0f / a -SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b +SIMD_WRAPPER_2(add_ps); // return a + b +SIMD_WRAPPER_2(div_ps); // return a / b +SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c +SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c +SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b +SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b +SIMD_WRAPPER_2(mul_ps); // return a * b +SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps); // return 1.0f / a +SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a) +SIMD_WRAPPER_2(sub_ps); // return a - b template <RoundMode RMT> static SIMDINLINE Float SIMDCALL round_ps(Float a) @@ -184,52 +156,57 @@ static SIMDINLINE Float SIMDCALL round_ps(Float a) return _mm512_roundscale_ps(a, static_cast<int>(RMT)); } -static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } -static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } +static SIMDINLINE Float SIMDCALL ceil_ps(Float a) +{ + return round_ps<RoundMode::CEIL_NOEXC>(a); +} +static SIMDINLINE Float SIMDCALL floor_ps(Float a) +{ + return round_ps<RoundMode::FLOOR_NOEXC>(a); +} //----------------------------------------------------------------------- // Integer (various width) arithmetic operations //----------------------------------------------------------------------- SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) -//SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +// SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) +// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) - // return (a * b) & 0xFFFFFFFF - // - // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, - // and store the low 32 bits of the intermediate integers in dst. +// return (a * b) & 0xFFFFFFFF +// +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, +// and store the low 32 bits of the intermediate integers in dst. SIMD_IWRAPPER_2(mullo_epi32); SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) +// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int) -SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int) +SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int) +SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int) +SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int) +SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int) // SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) // SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) // SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) // SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) - //----------------------------------------------------------------------- // Shift operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT +SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT SIMD_IWRAPPER_2(sllv_epi32); -SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) +SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) +SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) #if 0 SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint) @@ -246,32 +223,32 @@ SIMD_IWRAPPER_2(srlv_epi32); //----------------------------------------------------------------------- // Conversion operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) { return _mm512_castpd_ps(a); } -static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) { return _mm512_castps_si512(a); } -static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) { return _mm512_castsi512_pd(a); } -static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) { return _mm512_castps_pd(a); } -static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a) { return _mm512_castpd_si512(a); } -static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) { return _mm512_castsi512_ps(a); } @@ -281,18 +258,19 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (i return _mm512_cvtepi32_ps(a); } -//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64) +// SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) 
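The cast wrappers above (castpd_ps through castsi_ps) only reinterpret register bits and compile to no instruction at all, while the cvt wrappers convert lane values. A minimal standalone sketch of the distinction, using plain AVX-512 intrinsics rather than this library's wrapper types (the values and variable names here are illustrative, not part of this change):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512  f    = _mm512_set1_ps(1.0f);
    __m512i bits = _mm512_castps_si512(f); // reinterpret: lane bits stay 0x3f800000
    __m512i vals = _mm512_cvtps_epi32(f);  // convert: lane value becomes the integer 1

    alignas(64) int b[16], v[16];
    _mm512_store_si512((__m512i*)b, bits);
    _mm512_store_si512((__m512i*)v, vals);
    printf("cast: 0x%08x  cvt: %d\n", b[0], v[0]); // prints "cast: 0x3f800000  cvt: 1"
    return 0;
}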
+SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32) +SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32) +SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64) +SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64) -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) +static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) { return _mm512_cvtps_epi32(a); } -static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) +static SIMDINLINE Integer SIMDCALL + cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) { return _mm512_cvttps_epi32(a); } @@ -300,13 +278,13 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (in //----------------------------------------------------------------------- // Comparison operations //----------------------------------------------------------------------- -template<CompareType CmpTypeT> +template <CompareType CmpTypeT> static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b) { return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT)); } -template<CompareType CmpTypeT> +template <CompareType CmpTypeT> static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b { // Legacy vector mask generator @@ -314,21 +292,39 @@ static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) return castsi_ps(vmask(result)); } -static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); } +static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) +{ + return cmp_ps<CompareType::LT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) +{ + return cmp_ps<CompareType::GT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) +{ + return cmp_ps<CompareType::NEQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) +{ + return cmp_ps<CompareType::EQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) +{ + return cmp_ps<CompareType::GE_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) +{ + return cmp_ps<CompareType::LE_OQ>(a, b); +} -template<CompareTypeInt CmpTypeT> +template <CompareTypeInt CmpTypeT> static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b) { // Legacy vector mask generator __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT)); return vmask(result); } -template<CompareTypeInt CmpTypeT> +template <CompareTypeInt CmpTypeT> static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b) { // Legacy vector mask generator @@ -336,22 +332,24 @@ static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b) return vmask(result); } -//SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) -//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, 
cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) -SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32) -SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) -//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) -SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32) -SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64) -SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) +// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) +SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32) +SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) +// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) +SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32) +SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64) +SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32) -static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) +static SIMDINLINE bool SIMDCALL testz_ps(Float a, + Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) { return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b)))); } -static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) +static SIMDINLINE bool SIMDCALL testz_si(Integer a, + Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) { return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b))); } @@ -376,75 +374,82 @@ static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b); } - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, + Integer b, + Float mask) // return mask ? b : a (int) { return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); } -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, + Integer b, + Integer mask) // return mask ? 
b : a (int) { return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); } -static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) +static SIMDINLINE Float SIMDCALL + broadcast_ss(float const* p) // return *p (all elements in vector get same value) { return _mm512_set1_ps(*p); } -template<int imm> +template <int imm> static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a) { return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm)); } -template<int imm> +template <int imm> static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a) { return _mm512_extractf64x4_pd(a, imm); } -template<int imm> +template <int imm> static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a) { return _mm512_extracti64x4_epi64(a, imm); } -template<int imm> +template <int imm> static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b) { return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm)); } -template<int imm> +template <int imm> static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b) { return _mm512_insertf64x4(a, b, imm); } -template<int imm> +template <int imm> static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b) { return _mm512_inserti64x4(a, b, imm); } -// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16 -// SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32 -// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16 -// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32 +// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and +// _mm512_packs_epi16 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 +// and _mm512_packs_epi32 SIMD_IWRAPPER_2(packus_epi16); // See documentation for +// _mm512_packus_epi16 and _mm512_packus_epi16 SIMD_IWRAPPER_2(packus_epi32); // See documentation +// for _mm512_packus_epi32 and _mm512_packus_epi32 -template<int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a) +template <int ImmT> +static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) { return _mm512_permute_ps(a, ImmT); } -static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) +static SIMDINLINE Integer SIMDCALL + permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) { return _mm512_permutexvar_epi32(swiz, a); } -static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) +static SIMDINLINE Float SIMDCALL + permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) { return _mm512_permutexvar_ps(swiz, a); } @@ -455,11 +460,11 @@ SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4); SIMD_IWRAPPER_1I(shuffle_epi32); -//SIMD_IWRAPPER_2(shuffle_epi8); +// SIMD_IWRAPPER_2(shuffle_epi8); SIMD_DWRAPPER_2I(shuffle_pd); SIMD_WRAPPER_2I(shuffle_ps); -template<int ImmT> +template <int ImmT> static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) { return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); @@ -467,73 +472,79 @@ static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) SIMD_IWRAPPER_2(unpackhi_epi16); 
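One subtlety in the permute_epi32 and permute_ps wrappers above: the wrapper keeps the AVX2-style (data, swiz) argument order, while the underlying _mm512_permutexvar_* intrinsics take the index vector first. Because both arguments of the integer form are __m512i, a swapped call would still compile, so funneling every caller through one wrapper is the safer design. A small sketch of a full lane reversal through the same intrinsic (the helper name is ours, not from this change):

#include <immintrin.h>

// result[i] = a[idx[i]] for each 32-bit lane i; lane i of idx holds 15-i,
// so the 16 lanes come back in reverse order.
static inline __m512 reverse_lanes_ps(__m512 a)
{
    const __m512i idx = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                         8, 9, 10, 11, 12, 13, 14, 15);
    return _mm512_permutexvar_ps(idx, a); // index vector first, data second
}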
-//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps); +// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps); static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) { return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b))); } SIMD_IWRAPPER_2(unpackhi_epi64); -//SIMD_IWRAPPER_2(unpackhi_epi8); +// SIMD_IWRAPPER_2(unpackhi_epi8); SIMD_DWRAPPER_2(unpackhi_pd); SIMD_WRAPPER_2(unpackhi_ps); -//SIMD_IWRAPPER_2(unpacklo_epi16); +// SIMD_IWRAPPER_2(unpacklo_epi16); SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps); SIMD_IWRAPPER_2(unpacklo_epi64); -//SIMD_IWRAPPER_2(unpacklo_epi8); +// SIMD_IWRAPPER_2(unpacklo_epi8); SIMD_DWRAPPER_2(unpacklo_pd); SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT)); } -static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) +static SIMDINLINE Float SIMDCALL + load1_ps(float const* p) // return *p (broadcast 1 value to all elements) { return broadcast_ss(p); } -static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) +static SIMDINLINE Float SIMDCALL + load_ps(float const* p) // return *p (loads SIMD width elements from memory) { return _mm512_load_ps(p); } -static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p +static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p { return _mm512_load_si512(&p->v); } -static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) +static SIMDINLINE Float SIMDCALL + loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) { return _mm512_loadu_ps(p); } -static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) +static SIMDINLINE Integer SIMDCALL + loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) { return _mm512_loadu_si512(p); } // for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps()); return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT)); } -static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) +static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) { Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si()); _mm512_mask_store_ps(p, m, src); } -//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) +// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) //{ // __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); // return static_cast<uint64_t>(m); @@ -565,78 +576,99 @@ static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements a return _mm512_set1_epi8(i); } -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) +static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) { return _mm512_set1_ps(f); } -static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double) +static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double) { return _mm512_setzero_pd(); } -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) +static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) { return _mm512_setzero_ps(); } -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) +static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) { return _mm512_setzero_si512(); } -static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory) +static SIMDINLINE void SIMDCALL + store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) { _mm512_store_ps(p, a); } -static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a +static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a { _mm512_store_si512(&p->v, a); } -static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem) +static SIMDINLINE void SIMDCALL + storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem) { _mm512_storeu_si512(&p->v, a); } -static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) +static SIMDINLINE void SIMDCALL + stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) { _mm512_stream_ps(p, a); } -static SIMDINLINE Integer SIMDCALL set_epi32( - int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, - int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +static SIMDINLINE Integer SIMDCALL set_epi32(int i15, + int i14, + int i13, + int i12, + int i11, + int i10, + int i9, + int i8, + int i7, + int i6, + int i5, + int i4, + int i3, + int i2, + int i1, + int i0) { - return _mm512_set_epi32( - i15, i14, i13, i12, i11, i10, i9, i8, - i7, i6, i5, i4, i3, i2, i1, i0); + return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE Integer SIMDCALL set_epi32( - int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +static SIMDINLINE Integer SIMDCALL + 
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) { - return set_epi32( - 0, 0, 0, 0, 0, 0, 0, 0, - i7, i6, i5, i4, i3, i2, i1, i0); + return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE Float SIMDCALL set_ps( - float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8, - float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) +static SIMDINLINE Float SIMDCALL set_ps(float i15, + float i14, + float i13, + float i12, + float i11, + float i10, + float i9, + float i8, + float i7, + float i6, + float i5, + float i4, + float i3, + float i2, + float i1, + float i0) { - return _mm512_set_ps( - i15, i14, i13, i12, i11, i10, i9, i8, - i7, i6, i5, i4, i3, i2, i1, i0); + return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE Float SIMDCALL set_ps( - float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) +static SIMDINLINE Float SIMDCALL + set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) { - return set_ps( - 0, 0, 0, 0, 0, 0, 0, 0, - i7, i6, i5, i4, i3, i2, i1, i0); + return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); } static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) @@ -665,4 +697,3 @@ static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) #undef SIMD_IWRAPPER_2 #undef SIMD_IWRAPPER_2_ #undef SIMD_IWRAPPER_2I - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl index fed6307f4bc..82aa2bb4173 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -29,139 +29,111 @@ // //============================================================================ -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return intrin(a);\ - } +#define SIMD_WRAPPER_1_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } -#define SIMD_WRAPPER_1(op) \ - SIMD_WRAPPER_1_(op, _mm512_##op) +#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b);\ - } +#define SIMD_WRAPPER_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_castsi512_ps(_mm512_##intrin(\ - _mm512_castps_si512(a), _mm512_castps_si512(b)));\ +#define SIMD_WRAPPERI_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_castsi512_ps( \ + _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ } -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##op(a, b);\ - } +#define SIMD_DWRAPPER_2(op) \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } -#define SIMD_WRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_WRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) +#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_DWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } -#define 
SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) +#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - {\ - return _mm512_##op(a, b, c);\ - } +#define SIMD_WRAPPER_3(op) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return _mm512_##op(a);\ - } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } +#define SIMD_IWRAPPER_1_8(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1_4(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return intrin(a, ImmT);\ +#define SIMD_IWRAPPER_1I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return intrin(a, ImmT); \ } #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b);\ - } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) +#define SIMD_IWRAPPER_2_(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } +#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return cmp(a, b);\ - } +#define SIMD_IWRAPPER_2_CMP(op, cmp) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\ +#define SIMD_IFWRAPPER_2(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ } -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) private: - static SIMDINLINE Integer vmask(__mmask32 m) - { - return _mm512_maskz_set1_epi16(m, -1); - } - static SIMDINLINE Integer vmask(__mmask64 m) - { - return _mm512_maskz_set1_epi8(m, -1); - } -public: +static SIMDINLINE Integer vmask(__mmask32 m) +{ + return _mm512_maskz_set1_epi16(m, -1); +} +static SIMDINLINE Integer vmask(__mmask64 m) +{ + return _mm512_maskz_set1_epi8(m, -1); +} -SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) +public: +SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) +SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) +SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) +SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) +SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) +SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) +SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -template<CompareTypeInt CmpTypeT> +template <CompareTypeInt CmpTypeT> static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b) { // Legacy vector mask generator __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT)); return vmask(result); } -template<CompareTypeInt CmpTypeT> +template <CompareTypeInt CmpTypeT> static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) { // Legacy vector mask generator @@ -169,19 +141,19 @@ static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) return vmask(result); } -SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) -SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) -SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) -SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) +SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) +SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) +SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) +SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 +SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 +SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 +SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 +SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 -SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8 -SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16 -SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8 +SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8 +SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16 +SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8 SIMD_IWRAPPER_2(shuffle_epi8); @@ -191,8 +163,6 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) return static_cast<uint64_t>(m); } - - #undef SIMD_WRAPPER_1_ #undef SIMD_WRAPPER_1 #undef SIMD_WRAPPER_2 @@ -214,4 +184,3 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) 
#undef SIMD_IWRAPPER_2 #undef SIMD_IWRAPPER_2_ #undef SIMD_IWRAPPER_2I - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl index 690ab386b46..9ec3ff6c6b1 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. 
#endif @@ -29,113 +29,85 @@ // //============================================================================ -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - {\ - return intrin(a);\ - } +#define SIMD_WRAPPER_1_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } -#define SIMD_WRAPPER_1(op) \ - SIMD_WRAPPER_1_(op, _mm512_##op) +#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b);\ - } +#define SIMD_WRAPPER_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_castsi512_ps(_mm512_##intrin(\ - _mm512_castps_si512(a), _mm512_castps_si512(b)));\ +#define SIMD_WRAPPERI_2_(op, intrin) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_castsi512_ps( \ + _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ } -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##op(a, b);\ - } +#define SIMD_DWRAPPER_2(op) \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } -#define SIMD_WRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_WRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) +#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_DWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } -#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) +#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - {\ - return _mm512_##op(a, b, c);\ - } +#define SIMD_WRAPPER_3(op) \ + static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return _mm512_##op(a);\ - } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } +#define SIMD_IWRAPPER_1_8(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \ - {\ - return _mm512_##op(a);\ - } +#define SIMD_IWRAPPER_1_4(op) \ + static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template<int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - {\ - return intrin(a, ImmT);\ +#define SIMD_IWRAPPER_1I_(op, intrin) \ + template <int 
ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a) \ + { \ + return intrin(a, ImmT); \ } #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b);\ - } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) +#define SIMD_IWRAPPER_2_(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } +#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return cmp(a, b);\ - } +#define SIMD_IWRAPPER_2_CMP(op, cmp) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\ +#define SIMD_IFWRAPPER_2(op, intrin) \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ } -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - {\ - return _mm512_##intrin(a, b, ImmT);\ +#define SIMD_IWRAPPER_2I_(op, intrin) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ + { \ + return _mm512_##intrin(a, b, ImmT); \ } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) -SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) -SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int) -SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int) -SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int) +SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) +SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int) +SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int) +SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int) #undef SIMD_WRAPPER_1_ #undef SIMD_WRAPPER_1 @@ -158,4 +130,3 @@ SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treat #undef SIMD_IWRAPPER_2 #undef SIMD_IWRAPPER_2_ #undef SIMD_IWRAPPER_2I - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl index 3e36ce5bd36..f9d4b8c3902 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl index 3e36ce5bd36..f9d4b8c3902 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl index 3e36ce5bd36..f9d4b8c3902 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX512_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl index 55981dceba1..91705f2646d 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif @@ -29,149 +29,143 @@ //============================================================================ static const int TARGET_SIMD_WIDTH = 8; -using SIMD128T = SIMD128Impl::AVXImpl; - -#define SIMD_WRAPPER_1(op) \ - static SIMDINLINE Float SIMDCALL op(Float const &a) \ - {\ - return Float\ - {\ - SIMD256T::op(a.v8[0]),\ - SIMD256T::op(a.v8[1]),\ - };\ +using SIMD128T = SIMD128Impl::AVXImpl; + +#define SIMD_WRAPPER_1(op) \ + static SIMDINLINE Float SIMDCALL op(Float const& a) \ + { \ + return Float{ \ + SIMD256T::op(a.v8[0]), \ + SIMD256T::op(a.v8[1]), \ + }; \ } -#define SIMD_WRAPPER_2(op) \ - static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ - {\ - return Float\ - {\ - SIMD256T::op(a.v8[0], b.v8[0]),\ - SIMD256T::op(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_WRAPPER_2(op) \ + static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ + { \ + return Float{ \ + SIMD256T::op(a.v8[0], b.v8[0]), \ + SIMD256T::op(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_WRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ - {\ - return Float\ - {\ - SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\ - SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_WRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ + { \ + return Float{ \ + SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ + SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_WRAPPER_2I_1(op) \ - template<int ImmT>\ - static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ - {\ - return Float\ - {\ - SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\ - SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_WRAPPER_2I_1(op) \ + template <int ImmT> \ + static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ + { \ + return Float{ \ + SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ + SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \ - {\ - return Float\ - 
{\ - SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\ - SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\ - };\ - } - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ - {\ - return Integer\ - {\ - SIMD256T::op(a.v8[0]),\ - SIMD256T::op(a.v8[1]),\ - };\ +#define SIMD_WRAPPER_3(op) \ + static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \ + { \ + return Float{ \ + SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ + SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ + }; \ } -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return Integer\ - {\ - SIMD256T::op(a.v8[0], b.v8[0]),\ - SIMD256T::op(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_IWRAPPER_1(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ + { \ + return Integer{ \ + SIMD256T::op(a.v8[0]), \ + SIMD256T::op(a.v8[1]), \ + }; \ } -#define SIMD_IWRAPPER_2I(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return Integer\ - {\ - SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\ - SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_IWRAPPER_2(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return Integer{ \ + SIMD256T::op(a.v8[0], b.v8[0]), \ + SIMD256T::op(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_IWRAPPER_2I_1(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return Integer\ - {\ - SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\ - SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_IWRAPPER_2I(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return Integer{ \ + SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ + SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_IWRAPPER_2I_2(op) \ - template<int ImmT>\ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ - {\ - return Integer\ - {\ - SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\ - SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\ - };\ +#define SIMD_IWRAPPER_2I_1(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return Integer{ \ + SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ + SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ + }; \ } -#define SIMD_IWRAPPER_3(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \ - {\ - return Integer\ - {\ - SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\ - SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\ - };\ +#define SIMD_IWRAPPER_2I_2(op) \ + template <int ImmT> \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ + { \ + return Integer{ \ + SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \ + SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \ + }; \ + } + +#define SIMD_IWRAPPER_3(op) \ + static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \ + { \ + return Integer{ \ + SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ + SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ + }; \ } //----------------------------------------------------------------------- // Single precision floating point arithmetic operations 
//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a -SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b +SIMD_WRAPPER_2(add_ps); // return a + b +SIMD_WRAPPER_2(div_ps); // return a / b +SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c +SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c +SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b +SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b +SIMD_WRAPPER_2(mul_ps); // return a * b +SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a +SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) +SIMD_WRAPPER_2(sub_ps); // return a - b template <RoundMode RMT> -static SIMDINLINE Float SIMDCALL round_ps(Float const &a) +static SIMDINLINE Float SIMDCALL round_ps(Float const& a) { - return Float - { + return Float{ SIMD256T::template round_ps<RMT>(a.v8[0]), SIMD256T::template round_ps<RMT>(a.v8[1]), }; } -static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } -static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } +static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a) +{ + return round_ps<RoundMode::CEIL_NOEXC>(a); +} +static SIMDINLINE Float SIMDCALL floor_ps(Float const& a) +{ + return round_ps<RoundMode::FLOOR_NOEXC>(a); +} //----------------------------------------------------------------------- // Integer (various width) arithmetic operations @@ -179,7 +173,7 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<Roun SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? 
a : b (int32) @@ -207,178 +201,168 @@ SIMD_IWRAPPER_2(or_si); // return a | b (int) SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) SIMD_IWRAPPER_2(xor_si); // return a ^ b (int) - //----------------------------------------------------------------------- // Shift operations //----------------------------------------------------------------------- -template<int ImmT> -static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a) // return a << ImmT +template <int ImmT> +static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT { - return Integer - { + return Integer{ SIMD256T::template slli_epi32<ImmT>(a.v8[0]), SIMD256T::template slli_epi32<ImmT>(a.v8[1]), }; } -SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) +SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) -template<int ImmT> -static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a) // return a >> ImmT (int32) +template <int ImmT> +static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32) { - return Integer - { + return Integer{ SIMD256T::template srai_epi32<ImmT>(a.v8[0]), SIMD256T::template srai_epi32<ImmT>(a.v8[1]), }; } -template<int ImmT> -static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a) // return a >> ImmT (uint32) +template <int ImmT> +static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32) { - return Integer - { + return Integer{ SIMD256T::template srli_epi32<ImmT>(a.v8[0]), SIMD256T::template srli_epi32<ImmT>(a.v8[1]), }; } -template<int ImmT> // for each 128-bit lane: -static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a) // return a >> (ImmT*8) (uint) +template <int ImmT> // for each 128-bit lane: +static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint) { - return Integer - { + return Integer{ SIMD256T::template srli_si<ImmT>(a.v8[0]), SIMD256T::template srli_si<ImmT>(a.v8[1]), }; } -template<int ImmT> -static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) // same as srli_si, but with Float cast to int +template <int ImmT> +static SIMDINLINE Float SIMDCALL + srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int { - return Float - { + return Float{ SIMD256T::template srlisi_ps<ImmT>(a.v8[0]), SIMD256T::template srlisi_ps<ImmT>(a.v8[1]), }; } -SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) +SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) //----------------------------------------------------------------------- // Conversion operations //----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) { - return Float - { + return Float{ SIMD256T::castpd_ps(a.v8[0]), SIMD256T::castpd_ps(a.v8[1]), }; } -static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a) +static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) { - return Integer - { + return Integer{ SIMD256T::castps_si(a.v8[0]), SIMD256T::castps_si(a.v8[1]), }; } -static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a) +static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) { - return Double - { + return Double{ SIMD256T::castsi_pd(a.v8[0]), SIMD256T::castsi_pd(a.v8[1]), }; } -static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return 
*(Double*)(&a) +static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) { - return Double - { + return Double{ SIMD256T::castps_pd(a.v8[0]), SIMD256T::castps_pd(a.v8[1]), }; } -static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a) +static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) { - return Float - { + return Float{ SIMD256T::castsi_ps(a.v8[0]), SIMD256T::castsi_ps(a.v8[1]), }; } -static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float) +static SIMDINLINE Float SIMDCALL + cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) { - return Float - { + return Float{ SIMD256T::cvtepi32_ps(a.v8[0]), SIMD256T::cvtepi32_ps(a.v8[1]), }; } -static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a) // return (int16)a (uint8 --> int16) +static SIMDINLINE Integer SIMDCALL + cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16) { - return Integer - { + return Integer{ SIMD256T::cvtepu8_epi16(a.v4[0]), SIMD256T::cvtepu8_epi16(a.v4[1]), }; } -static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint8 --> int32) +static SIMDINLINE Integer SIMDCALL + cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32) { - return Integer - { + return Integer{ SIMD256T::cvtepu8_epi32(a.v4[0]), SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])), - }; + }; } -static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint16 --> int32) +static SIMDINLINE Integer SIMDCALL + cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32) { - return Integer - { + return Integer{ SIMD256T::cvtepu16_epi32(a.v4[0]), SIMD256T::cvtepu16_epi32(a.v4[1]), }; } -static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint16 --> int64) +static SIMDINLINE Integer SIMDCALL + cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64) { - return Integer - { + return Integer{ SIMD256T::cvtepu16_epi64(a.v4[0]), SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])), }; } -static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint32 --> int64) +static SIMDINLINE Integer SIMDCALL + cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64) { - return Integer - { + return Integer{ SIMD256T::cvtepu32_epi64(a.v4[0]), SIMD256T::cvtepu32_epi64(a.v4[1]), }; } -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32) +static SIMDINLINE Integer SIMDCALL + cvtps_epi32(Float const& a) // return (int32)a (float --> int32) { - return Integer - { + return Integer{ SIMD256T::cvtps_epi32(a.v8[0]), SIMD256T::cvtps_epi32(a.v8[1]), }; } -static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32) +static SIMDINLINE Integer SIMDCALL + cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) { - return Integer - { + return Integer{ SIMD256T::cvtps_epi32(a.v8[0]), SIMD256T::cvtps_epi32(a.v8[1]), }; @@ -387,126 +371,144 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // ret //----------------------------------------------------------------------- // Comparison operations 
//----------------------------------------------------------------------- -template<CompareType CmpTypeT> -static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b +template <CompareType CmpTypeT> +static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b { - return Float - { + return Float{ SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]), SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]), }; } -static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); } -static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); } +static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::LT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::GT_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::NEQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::EQ_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::GE_OQ>(a, b); +} +static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) +{ + return cmp_ps<CompareType::LE_OQ>(a, b); +} -template<CompareType CmpTypeT> -static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b) +template <CompareType CmpTypeT> +static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b) { return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b))); } +SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) +SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) +SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) +SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) +SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) +SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) +SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) +SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) +SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) - -static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float) +static SIMDINLINE bool SIMDCALL + testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 
1 : 0 (float) { - return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & - SIMD256T::testz_ps(a.v8[1], b.v8[1])); + return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1])); } -static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int) +static SIMDINLINE bool SIMDCALL + testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int) { - return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & - SIMD256T::testz_si(a.v8[1], b.v8[1])); + return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1])); } //----------------------------------------------------------------------- // Blend / shuffle / permute operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int) -{ - return Integer - { +SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) +SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) +SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, + Integer const& b, + Float const& mask) // return mask ? b : a (int) +{ + return Integer{ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), }; } -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int) +static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, + Integer const& b, + Integer const& mask) // return mask ? 
b : a (int) { - return Integer - { + return Integer{ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), }; } -static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) +static SIMDINLINE Float SIMDCALL + broadcast_ss(float const* p) // return *p (all elements in vector get same value) { float f = *p; - return Float - { + return Float{ SIMD256T::set1_ps(f), SIMD256T::set1_ps(f), }; } -template<int imm> -static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a) +template <int imm> +static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); return a.v8[imm]; } -template<int imm> -static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a) +template <int imm> +static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); return a.v8[imm]; } -template<int imm> -static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a) +template <int imm> +static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); return a.v8[imm]; } -template<int imm> -static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b) +template <int imm> +static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - Float r = a; + Float r = a; r.v8[imm] = b; return r; } -template<int imm> -static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b) +template <int imm> +static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - Double r = a; + Double r = a; r.v8[imm] = b; return r; } -template<int imm> -static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b) +template <int imm> +static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b) { SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); Integer r = a; @@ -514,27 +516,28 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Inte return r; } -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 +SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 +SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 +SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 +SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 -template<int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a) +template <int ImmT> +static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) { - return Float - { + return Float{ SIMD256T::template permute_ps<ImmT>(a.v8[0]), SIMD256T::template permute_ps<ImmT>(a.v8[1]), }; } 
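
The emulation pattern being reformatted in this file is easier to see in isolation: the 512-bit emulation path stores a vector as two 256-bit halves (v8[0], v8[1]) and every wrapped op forwards to the 256-bit implementation once per half. A minimal sketch of that scheme, assuming AVX; Float512 and the free-standing add_ps below are illustrative stand-ins, not the actual simdlib types or wrappers:

    #include <immintrin.h>

    // Stand-in for SIMD512Impl::Float on the emulation path:
    // a 512-bit vector held as two 256-bit halves, as in the diff above.
    struct Float512
    {
        __m256 v8[2];
    };

    // Mirrors what SIMD_WRAPPER_2(add_ps) expands to:
    // one 256-bit add per half, results repacked into the pair.
    static inline Float512 add_ps(Float512 const& a, Float512 const& b)
    {
        return Float512{{
            _mm256_add_ps(a.v8[0], b.v8[0]),
            _mm256_add_ps(a.v8[1], b.v8[1]),
        }};
    }

Immediate-argument wrappers such as SIMD_WRAPPER_2I follow the same shape, except the immediate is split as well: the low bits go to the first half and the bits shifted down by TARGET_SIMD_WIDTH go to the second, as the macro definitions earlier in this hunk show.
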
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32) +static SIMDINLINE Integer SIMDCALL permute_epi32( + Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) { return castps_si(permute_ps(castsi_ps(a), swiz)); } -static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float) +static SIMDINLINE Float SIMDCALL + permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) { const auto mask = SIMD256T::set1_epi32(7); @@ -544,10 +547,11 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask)); auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask)); - return Float - { - SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), - SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), + return Float{ + SIMD256T::blendv_ps( + lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), + SIMD256T::blendv_ps( + hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), }; } @@ -562,7 +566,7 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // ESAC // RETURN tmp[127:0] // } -// +// // dst[127:0] : = SELECT4(a[511:0], imm8[1:0]) // dst[255:128] : = SELECT4(a[511:0], imm8[3:2]) // dst[383:256] : = SELECT4(b[511:0], imm8[5:4]) @@ -574,32 +578,35 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // AVX instructions for emulation. // template <int shuf> -static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b) +static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b) { - return Float - { - SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), - SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), + return Float{ + SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], + a.v8[1]), + SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], + b.v8[1]), }; } template <int shuf> -static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b) +static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b) { - return Double - { - SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), - SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), + return Double{ + SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], + a.v8[1]), + SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], + b.v8[1]), }; } template <int shuf> -static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b) +static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b) { - return Integer - { - SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), - SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), + return Integer{ + SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | 
((shuf & 0x0C) << 2)>(a.v8[0], + a.v8[1]), + SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], + b.v8[1]), }; } @@ -624,209 +631,193 @@ SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { - return Float - { + return Float{ SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]), SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]), }; } -static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) +static SIMDINLINE Float SIMDCALL + load1_ps(float const* p) // return *p (broadcast 1 value to all elements) { return broadcast_ss(p); } -static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) +static SIMDINLINE Float SIMDCALL + load_ps(float const* p) // return *p (loads SIMD width elements from memory) { - return Float - { - SIMD256T::load_ps(p), - SIMD256T::load_ps(p + TARGET_SIMD_WIDTH) - }; + return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)}; } -static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p +static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p { - return Integer - { + return Integer{ SIMD256T::load_si(&p->v8[0]), SIMD256T::load_si(&p->v8[1]), }; } -static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) +static SIMDINLINE Float SIMDCALL + loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) { - return Float - { - SIMD256T::loadu_ps(p), - SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH) - }; + return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)}; } -static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) +static SIMDINLINE Integer SIMDCALL + loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) { - return Integer - { + return Integer{ SIMD256T::loadu_si(&p->v8[0]), SIMD256T::loadu_si(&p->v8[1]), }; } // for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template<ScaleFactor ScaleT> -static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask) +template <ScaleFactor ScaleT> +static SIMDINLINE Float SIMDCALL + mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) { - return Float - { + return Float{ SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), }; } -static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src) +static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) { SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]); SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]); } -static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a) +static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a) { uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0])); - mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4); + mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4); return mask; } -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) { uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0])); - mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2); + mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2); return mask; } -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a) +static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) { uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0])); - mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH; + mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH; return mask; } static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) { - return Integer - { - SIMD256T::set1_epi32(i), - SIMD256T::set1_epi32(i) - }; + return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)}; } static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) { - return Integer - { - SIMD256T::set1_epi8(i), - SIMD256T::set1_epi8(i) - }; + return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)}; } -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) +static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) { - return Float - { - SIMD256T::set1_ps(f), - SIMD256T::set1_ps(f) - }; + return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)}; } -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) +static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) { - return Float - { - SIMD256T::setzero_ps(), - SIMD256T::setzero_ps() - }; + return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()}; } -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) +static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) { - return Integer - { - SIMD256T::setzero_si(), - SIMD256T::setzero_si() - }; + return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()}; } -static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores 
all elements contiguously in memory) +static SIMDINLINE void SIMDCALL + store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) { SIMD256T::store_ps(p, a.v8[0]); SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); } -static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a +static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a { SIMD256T::store_si(&p->v8[0], a.v8[0]); SIMD256T::store_si(&p->v8[1], a.v8[1]); } -static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache) +static SIMDINLINE void SIMDCALL + stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) { SIMD256T::stream_ps(p, a.v8[0]); SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); } -static SIMDINLINE Integer SIMDCALL set_epi32( - int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, - int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +static SIMDINLINE Integer SIMDCALL set_epi32(int i15, + int i14, + int i13, + int i12, + int i11, + int i10, + int i9, + int i8, + int i7, + int i6, + int i5, + int i4, + int i3, + int i2, + int i1, + int i0) { - return Integer - { - SIMD256T::set_epi32( - i7, i6, i5, i4, i3, i2, i1, i0), - SIMD256T::set_epi32( - i15, i14, i13, i12, i11, i10, i9, i8) - }; + return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0), + SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)}; } -static SIMDINLINE Integer SIMDCALL set_epi32( - int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +static SIMDINLINE Integer SIMDCALL + set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) { - return set_epi32( - 0, 0, 0, 0, 0, 0, 0, 0, - i7, i6, i5, i4, i3, i2, i1, i0); + return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); } -static SIMDINLINE Float SIMDCALL set_ps( - float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8, - float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) +static SIMDINLINE Float SIMDCALL set_ps(float i15, + float i14, + float i13, + float i12, + float i11, + float i10, + float i9, + float i8, + float i7, + float i6, + float i5, + float i4, + float i3, + float i2, + float i1, + float i0) { - return Float - { - SIMD256T::set_ps( - i7, i6, i5, i4, i3, i2, i1, i0), - SIMD256T::set_ps( - i15, i14, i13, i12, i11, i10, i9, i8) - }; + return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0), + SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)}; } -static SIMDINLINE Float SIMDCALL set_ps( - float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) +static SIMDINLINE Float SIMDCALL + set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) { - return set_ps( - 0, 0, 0, 0, 0, 0, 0, 0, - i7, i6, i5, i4, i3, i2, i1, i0); + return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); } static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) { - return Float - { - SIMD256T::vmask_ps(mask), - SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH) - }; + return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)}; } #undef SIMD_WRAPPER_1 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl index bc5bff477a4..473934824ee 100644 --- 
a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl @@ -1,28 +1,27 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #if !defined(__SIMD_LIB_AVX_HPP__) #error Do not include this file directly, use "simdlib.hpp" instead. #endif // no backwards compatibility for simd mask-enabled functions - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp index df2df1b09cd..7902bcb2b64 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #pragma once #if 0 //=========================================================================== diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp index 0fad0e1fd8c..944c3c23fd3 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2017 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ****************************************************************************/ #pragma once #if !defined(__cplusplus) @@ -30,9 +30,9 @@ #include <inttypes.h> #include <stdint.h> -#define SIMD_ARCH_AVX 0 -#define SIMD_ARCH_AVX2 1 -#define SIMD_ARCH_AVX512 2 +#define SIMD_ARCH_AVX 0 +#define SIMD_ARCH_AVX2 1 +#define SIMD_ARCH_AVX512 2 #if !defined(SIMD_ARCH) #define SIMD_ARCH SIMD_ARCH_AVX @@ -55,81 +55,81 @@ namespace SIMDImpl { enum class CompareType { - EQ_OQ = 0x00, // Equal (ordered, nonsignaling) - LT_OS = 0x01, // Less-than (ordered, signaling) - LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) - UNORD_Q = 0x03, // Unordered (nonsignaling) - NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling) - NLT_US = 0x05, // Not-less-than (unordered, signaling) - NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) - ORD_Q = 0x07, // Ordered (nonsignaling) - EQ_UQ = 0x08, // Equal (unordered, non-signaling) - NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) - NGT_US = 0x0A, // Not-greater-than (unordered, signaling) - FALSE_OQ = 0x0B, // False (ordered, nonsignaling) - NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling) - GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling) - GT_OS = 0x0E, // Greater-than (ordered, signaling) - TRUE_UQ = 0x0F, // True (unordered, non-signaling) - EQ_OS = 0x10, // Equal (ordered, signaling) - LT_OQ = 0x11, // Less-than (ordered, nonsignaling) - LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling) - UNORD_S = 0x13, // Unordered (signaling) - NEQ_US = 0x14, // Not-equal (unordered, signaling) - NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling) - NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling) - ORD_S = 0x17, // Ordered (signaling) - EQ_US = 0x18, // Equal (unordered, signaling) - NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling) - NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling) - FALSE_OS = 0x1B, // False (ordered, signaling) - NEQ_OS = 0x1C, // Not-equal (ordered, signaling) - GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling) - GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling) - TRUE_US = 0x1F, // True (unordered, signaling) + EQ_OQ = 0x00, // Equal (ordered, nonsignaling) + LT_OS = 0x01, // Less-than (ordered, signaling) + LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) + UNORD_Q = 0x03, // Unordered (nonsignaling) + NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling) + NLT_US = 0x05, // Not-less-than (unordered, signaling) + NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) + ORD_Q = 0x07, // Ordered (nonsignaling) + EQ_UQ = 0x08, // Equal (unordered, non-signaling) + NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) + NGT_US = 0x0A, // Not-greater-than (unordered, signaling) + FALSE_OQ = 0x0B, // False (ordered, nonsignaling) + NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling) + GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling) + GT_OS = 0x0E, // Greater-than (ordered, signaling) + TRUE_UQ = 0x0F, // True (unordered, non-signaling) + EQ_OS = 0x10, // Equal (ordered, signaling) + LT_OQ = 0x11, // Less-than (ordered, nonsignaling) + LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling) + UNORD_S = 0x13, // Unordered (signaling) + NEQ_US = 0x14, // Not-equal (unordered, signaling) + NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling) + NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling) + ORD_S = 0x17, // Ordered (signaling) + EQ_US = 0x18, // Equal 
(unordered, signaling) + NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling) + NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling) + FALSE_OS = 0x1B, // False (ordered, signaling) + NEQ_OS = 0x1C, // Not-equal (ordered, signaling) + GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling) + GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling) + TRUE_US = 0x1F, // True (unordered, signaling) }; #if SIMD_ARCH >= SIMD_ARCH_AVX512 enum class CompareTypeInt { - EQ = _MM_CMPINT_EQ, // Equal - LT = _MM_CMPINT_LT, // Less than - LE = _MM_CMPINT_LE, // Less than or Equal - NE = _MM_CMPINT_NE, // Not Equal - GE = _MM_CMPINT_GE, // Greater than or Equal - GT = _MM_CMPINT_GT, // Greater than + EQ = _MM_CMPINT_EQ, // Equal + LT = _MM_CMPINT_LT, // Less than + LE = _MM_CMPINT_LE, // Less than or Equal + NE = _MM_CMPINT_NE, // Not Equal + GE = _MM_CMPINT_GE, // Greater than or Equal + GT = _MM_CMPINT_GT, // Greater than }; #endif // SIMD_ARCH >= SIMD_ARCH_AVX512 enum class ScaleFactor { - SF_1 = 1, // No scaling - SF_2 = 2, // Scale offset by 2 - SF_4 = 4, // Scale offset by 4 - SF_8 = 8, // Scale offset by 8 + SF_1 = 1, // No scaling + SF_2 = 2, // Scale offset by 2 + SF_4 = 4, // Scale offset by 4 + SF_8 = 8, // Scale offset by 8 }; enum class RoundMode { - TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5) - TO_NEG_INF = 0x01, // Round to negative infinity - TO_POS_INF = 0x02, // Round to positive infinity - TO_ZERO = 0x03, // Round to 0 a.k.a. truncate - CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register - - RAISE_EXC = 0x00, // Raise exception on overflow - NO_EXC = 0x08, // Suppress exceptions - - NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC), - NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC), - FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC), - FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC), - CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC), - CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC), - TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC), - TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC), - RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC), - NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC), + TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5) + TO_NEG_INF = 0x01, // Round to negative infinity + TO_POS_INF = 0x02, // Round to positive infinity + TO_ZERO = 0x03, // Round to 0 a.k.a. 
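
The CompareType values above are the immediate encodings taken by the AVX compare intrinsics, so they can be fed straight to _mm256_cmp_ps; for example EQ_OQ (0x00) corresponds to _CMP_EQ_OQ. A small usage sketch (illustrative, not from this commit):

#include <immintrin.h>

static __m256 cmp_eq_oq(__m256 a, __m256 b)
{
    // ordered, non-signaling equality: lanes containing NaN compare false,
    // and quiet NaNs raise no floating-point exception
    return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);     // _CMP_EQ_OQ == 0x00 == EQ_OQ
}
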
truncate + CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register + + RAISE_EXC = 0x00, // Raise exception on overflow + NO_EXC = 0x08, // Suppress exceptions + + NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC), + NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC), + FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC), + FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC), + CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC), + CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC), + TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC), + TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC), + RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC), + NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC), }; struct Traits @@ -140,7 +140,7 @@ namespace SIMDImpl }; // Attribute, 4-dimensional attribute in SIMD SOA layout - template<typename Float, typename Integer, typename Double> + template <typename Float, typename Integer, typename Double> union Vec4 { Float v[4]; @@ -148,14 +148,14 @@ namespace SIMDImpl Double vd[4]; struct { - Float x; - Float y; - Float z; - Float w; + Float x; + Float y; + Float z; + Float w; }; - SIMDINLINE Float& SIMDCALL operator[] (const int i) { return v[i]; } - SIMDINLINE Float const & SIMDCALL operator[] (const int i) const { return v[i]; } - SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const & in) + SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; } + SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; } + SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in) { v[0] = in.v[0]; v[1] = in.v[1]; @@ -171,8 +171,16 @@ namespace SIMDImpl { SIMDINLINE Float() = default; SIMDINLINE Float(__m128 in) : v(in) {} - SIMDINLINE Float& SIMDCALL operator=(__m128 in) { v = in; return *this; } - SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; } + SIMDINLINE Float& SIMDCALL operator=(__m128 in) + { + v = in; + return *this; + } + SIMDINLINE Float& SIMDCALL operator=(Float const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m128() const { return v; } SIMDALIGN(__m128, 16) v; @@ -182,8 +190,16 @@ namespace SIMDImpl { SIMDINLINE Integer() = default; SIMDINLINE Integer(__m128i in) : v(in) {} - SIMDINLINE Integer& SIMDCALL operator=(__m128i in) { v = in; return *this; } - SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; } + SIMDINLINE Integer& SIMDCALL operator=(__m128i in) + { + v = in; + return *this; + } + SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m128i() const { return v; } SIMDALIGN(__m128i, 16) v; @@ -193,8 +209,16 @@ namespace SIMDImpl { SIMDINLINE Double() = default; SIMDINLINE Double(__m128d in) : v(in) {} - SIMDINLINE Double& SIMDCALL operator=(__m128d in) { v = in; return *this; } - SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; } + SIMDINLINE Double& SIMDCALL operator=(__m128d in) + { + v = in; + return *this; + } + SIMDINLINE Double& SIMDCALL operator=(Double const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m128d() const { return v; } SIMDALIGN(__m128d, 16) v; @@ -204,7 +228,7 @@ namespace SIMDImpl using Mask = uint8_t; static const uint32_t SIMD_WIDTH = 4; - } // ns SIMD128Impl + } // namespace SIMD128Impl namespace 
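
Likewise, the RoundMode encodings above mirror the SSE4.1/AVX rounding immediates; NINT_NOEXC (TO_NEAREST_INT | NO_EXC) has the same value as _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC. A usage sketch (illustrative only):

#include <immintrin.h>

static __m256 round_nearest_noexc(__m256 v)
{
    // round to nearest, suppressing precision exceptions (0x00 | 0x08)
    return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
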
SIMD256Impl { @@ -212,12 +236,21 @@ namespace SIMDImpl { SIMDINLINE Float() = default; SIMDINLINE Float(__m256 in) : v(in) {} - SIMDINLINE Float(SIMD128Impl::Float const &in_lo, SIMD128Impl::Float const &in_hi = _mm_setzero_ps()) + SIMDINLINE Float(SIMD128Impl::Float const& in_lo, + SIMD128Impl::Float const& in_hi = _mm_setzero_ps()) { v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1); } - SIMDINLINE Float& SIMDCALL operator=(__m256 in) { v = in; return *this; } - SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; } + SIMDINLINE Float& SIMDCALL operator=(__m256 in) + { + v = in; + return *this; + } + SIMDINLINE Float& SIMDCALL operator=(Float const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m256() const { return v; } SIMDALIGN(__m256, 32) v; @@ -228,12 +261,21 @@ namespace SIMDImpl { SIMDINLINE Integer() = default; SIMDINLINE Integer(__m256i in) : v(in) {} - SIMDINLINE Integer(SIMD128Impl::Integer const &in_lo, SIMD128Impl::Integer const &in_hi = _mm_setzero_si128()) + SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo, + SIMD128Impl::Integer const& in_hi = _mm_setzero_si128()) { v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1); } - SIMDINLINE Integer& SIMDCALL operator=(__m256i in) { v = in; return *this; } - SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; } + SIMDINLINE Integer& SIMDCALL operator=(__m256i in) + { + v = in; + return *this; + } + SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m256i() const { return v; } SIMDALIGN(__m256i, 32) v; @@ -243,13 +285,22 @@ namespace SIMDImpl union Double { SIMDINLINE Double() = default; - SIMDINLINE Double(__m256d const &in) : v(in) {} - SIMDINLINE Double(SIMD128Impl::Double const &in_lo, SIMD128Impl::Double const &in_hi = _mm_setzero_pd()) + SIMDINLINE Double(__m256d const& in) : v(in) {} + SIMDINLINE Double(SIMD128Impl::Double const& in_lo, + SIMD128Impl::Double const& in_hi = _mm_setzero_pd()) { v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1); } - SIMDINLINE Double& SIMDCALL operator=(__m256d in) { v = in; return *this; } - SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; } + SIMDINLINE Double& SIMDCALL operator=(__m256d in) + { + v = in; + return *this; + } + SIMDINLINE Double& SIMDCALL operator=(Double const& in) + { + v = in.v; + return *this; + } SIMDINLINE SIMDCALL operator __m256d() const { return v; } SIMDALIGN(__m256d, 32) v; @@ -260,7 +311,7 @@ namespace SIMDImpl using Mask = uint8_t; static const uint32_t SIMD_WIDTH = 8; - } // ns SIMD256Impl + } // namespace SIMD256Impl namespace SIMD512Impl { @@ -282,14 +333,14 @@ namespace SIMDImpl union __m512i { private: - int8_t m512i_i8[64]; - int16_t m512i_i16[32]; - int32_t m512i_i32[16]; - int64_t m512i_i64[8]; - uint8_t m512i_u8[64]; - uint16_t m512i_u16[32]; - uint32_t m512i_u32[16]; - uint64_t m512i_u64[8]; + int8_t m512i_i8[64]; + int16_t m512i_i16[32]; + int32_t m512i_i32[16]; + int64_t m512i_i64[8]; + uint8_t m512i_u8[64]; + uint16_t m512i_u16[32]; + uint32_t m512i_u32[16]; + uint64_t m512i_u64[8]; }; using __mmask16 = uint16_t; @@ -305,9 +356,18 @@ namespace SIMDImpl { SIMDINLINE Float() = default; SIMDINLINE Float(__m512 in) : v(in) {} - SIMDINLINE Float(SIMD256Impl::Float const &in_lo, SIMD256Impl::Float const &in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; } - SIMDINLINE Float& SIMDCALL 
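
The Vec4 union earlier in this file is worth calling out: in the SIMD-SOA layout, each of x/y/z/w is itself a whole SIMD register, so one Vec4 carries SIMD_WIDTH complete 4-component attributes. A reduced model with the lane type flattened to a plain array (illustrative only; the anonymous struct mirrors the extension the source itself relies on):

struct Lane8 { float f[8]; };           // stand-in for a SIMD Float register

union Vec4Model
{
    Lane8 v[4];                         // indexable form, as in operator[]
    struct { Lane8 x, y, z, w; };       // v[0]==x .. v[3]==w, same storage
};
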
operator=(__m512 in) { v = in; return *this; } - SIMDINLINE Float& SIMDCALL operator=(Float const & in) + SIMDINLINE Float(SIMD256Impl::Float const& in_lo, + SIMD256Impl::Float const& in_hi = _mm256_setzero_ps()) + { + v8[0] = in_lo; + v8[1] = in_hi; + } + SIMDINLINE Float& SIMDCALL operator=(__m512 in) + { + v = in; + return *this; + } + SIMDINLINE Float& SIMDCALL operator=(Float const& in) { #if SIMD_ARCH >= SIMD_ARCH_AVX512 v = in.v; @@ -327,9 +387,18 @@ namespace SIMDImpl { SIMDINLINE Integer() = default; SIMDINLINE Integer(__m512i in) : v(in) {} - SIMDINLINE Integer(SIMD256Impl::Integer const &in_lo, SIMD256Impl::Integer const &in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; } - SIMDINLINE Integer& SIMDCALL operator=(__m512i in) { v = in; return *this; } - SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) + SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo, + SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256()) + { + v8[0] = in_lo; + v8[1] = in_hi; + } + SIMDINLINE Integer& SIMDCALL operator=(__m512i in) + { + v = in; + return *this; + } + SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) { #if SIMD_ARCH >= SIMD_ARCH_AVX512 v = in.v; @@ -350,9 +419,18 @@ namespace SIMDImpl { SIMDINLINE Double() = default; SIMDINLINE Double(__m512d in) : v(in) {} - SIMDINLINE Double(SIMD256Impl::Double const &in_lo, SIMD256Impl::Double const &in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; } - SIMDINLINE Double& SIMDCALL operator=(__m512d in) { v = in; return *this; } - SIMDINLINE Double& SIMDCALL operator=(Double const & in) + SIMDINLINE Double(SIMD256Impl::Double const& in_lo, + SIMD256Impl::Double const& in_hi = _mm256_setzero_pd()) + { + v8[0] = in_lo; + v8[1] = in_hi; + } + SIMDINLINE Double& SIMDCALL operator=(__m512d in) + { + v = in; + return *this; + } + SIMDINLINE Double& SIMDCALL operator=(Double const& in) { #if SIMD_ARCH >= SIMD_ARCH_AVX512 v = in.v; @@ -375,5 +453,5 @@ namespace SIMDImpl static const uint32_t SIMD_WIDTH = 16; #undef SIMD_ALIGNMENT_BYTES - } // ns SIMD512Impl -} // ns SIMDImpl + } // namespace SIMD512Impl +} // namespace SIMDImpl diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp index 43b74a68fde..8e874fbc223 100644 --- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #include "common/os.h" #include <stdarg.h> @@ -38,29 +38,32 @@ namespace ConsoleUtils { enum class TextColor { - BLACK = 0, + BLACK = 0, #if defined(_WIN32) - RED = 4, - GREEN = 2, - BLUE = 1, + RED = 4, + GREEN = 2, + BLUE = 1, #else - RED = 1, - GREEN = 2, - BLUE = 4, + RED = 1, + GREEN = 2, + BLUE = 4, #endif // _WIN32 - PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE), - CYAN = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), - YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN), - WHITE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), + PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE), + CYAN = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), + YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN), + WHITE = + static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), }; enum class TextStyle { - NORMAL = 0, - INTENSITY = 1, + NORMAL = 0, + INTENSITY = 1, }; - void SetTextColor(FILE* stream, TextColor color = TextColor::WHITE, TextStyle style = TextStyle::NORMAL) + void SetTextColor(FILE* stream, + TextColor color = TextColor::WHITE, + TextStyle style = TextStyle::NORMAL) { #if defined(_WIN32) @@ -89,7 +92,8 @@ namespace ConsoleUtils #else // !_WIN32 // Print ANSI codes - uint32_t cc = 30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color); + uint32_t cc = + 30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color); fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc); #endif @@ -110,17 +114,16 @@ namespace ConsoleUtils } static std::mutex g_stderrMutex; -} // ns ConsoleUtils - -bool SwrAssert( - bool chkDebugger, - bool& enabled, - const char* pExpression, - const char* pFileName, - uint32_t lineNum, - const char* pFunction, - const char* pFmtString, - ...) 
+} // namespace ConsoleUtils + +bool SwrAssert(bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* pFunction, + const char* pFmtString, + ...) { using namespace ConsoleUtils; std::lock_guard<std::mutex> l(g_stderrMutex); @@ -151,7 +154,7 @@ bool SwrAssert( #if defined(_WIN32) static const int MAX_MESSAGE_LEN = 2048; - char msgBuf[MAX_MESSAGE_LEN]; + char msgBuf[MAX_MESSAGE_LEN]; sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression); msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; @@ -169,15 +172,13 @@ bool SwrAssert( { va_list args; va_start(args, pFmtString); - offset = _vsnprintf_s( - msgBuf, - sizeof(msgBuf), - sizeof(msgBuf), - pFmtString, - args); + offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args); va_end(args); - if (offset < 0) { return true; } + if (offset < 0) + { + return true; + } OutputDebugStringA("\t"); OutputDebugStringA(msgBuf); @@ -186,46 +187,51 @@ bool SwrAssert( if (enabled && KNOB_ENABLE_ASSERT_DIALOGS) { - int retval = sprintf_s( - &msgBuf[offset], - MAX_MESSAGE_LEN - offset, - "\n\n" - "File: %s\n" - "Line: %d\n" - "\n" - "Expression: %s\n\n" - "Cancel: Disable this assert for the remainder of the process\n" - "Try Again: Break into the debugger\n" - "Continue: Continue execution (but leave assert enabled)", - pFileName, - lineNum, - pExpression); - - if (retval < 0) { return true; } + int retval = sprintf_s(&msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n" + "File: %s\n" + "Line: %d\n" + "\n" + "Expression: %s\n\n" + "Cancel: Disable this assert for the remainder of the process\n" + "Try Again: Break into the debugger\n" + "Continue: Continue execution (but leave assert enabled)", + pFileName, + lineNum, + pExpression); + + if (retval < 0) + { + return true; + } offset += retval; if (!IsDebuggerPresent()) { - sprintf_s( - &msgBuf[offset], - MAX_MESSAGE_LEN - offset, - "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); + sprintf_s(&msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a " + "program crash!"); } - retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND); + retval = MessageBoxA(nullptr, + msgBuf, + "Assert Failed", + MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND); switch (retval) { - case IDCANCEL: - enabled = false; - return false; + case IDCANCEL: + enabled = false; + return false; - case IDTRYAGAIN: - return true; + case IDTRYAGAIN: + return true; - case IDCONTINUE: - return false; + case IDCONTINUE: + return false; } } else @@ -238,11 +244,7 @@ bool SwrAssert( } void SwrTrace( - const char* pFileName, - uint32_t lineNum, - const char* pFunction, - const char* pFmtString, - ...) + const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...) 
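
In SetTextColor above, the non-Windows path emits standard ANSI SGR escape codes: foreground colors occupy 30-37, and the +60 offset selects the bright 90-97 range when INTENSITY is requested. A standalone sketch of that computation (illustrative, not the commit's code):

#include <cstdio>

static void set_ansi_color(FILE* stream, unsigned color, bool intense)
{
    // color in [0,7]; reset attributes, then set style and foreground code
    unsigned cc = 30 + (intense ? 60u : 0u) + color;
    std::fprintf(stream, "\033[0m\033[%u;%um", intense ? 1u : 0u, cc);
}
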
{ using namespace ConsoleUtils; std::lock_guard<std::mutex> l(g_stderrMutex); @@ -266,7 +268,7 @@ void SwrTrace( #if defined(_WIN32) static const int MAX_MESSAGE_LEN = 2048; - char msgBuf[MAX_MESSAGE_LEN]; + char msgBuf[MAX_MESSAGE_LEN]; sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction); msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; @@ -279,15 +281,13 @@ void SwrTrace( { va_list args; va_start(args, pFmtString); - offset = _vsnprintf_s( - msgBuf, - sizeof(msgBuf), - sizeof(msgBuf), - pFmtString, - args); + offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args); va_end(args); - if (offset < 0) { return; } + if (offset < 0) + { + return; + } OutputDebugStringA("\t"); OutputDebugStringA(msgBuf); diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h index a9e5bb4e77f..d74b7981255 100644 --- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h +++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #ifndef __SWR_ASSERT_H__ #define __SWR_ASSERT_H__ @@ -55,28 +55,38 @@ // Stupid preprocessor tricks to avoid -Wall / -W4 warnings #if defined(_MSC_VER) -#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable:4127)) +#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127)) #define _SWR_WARN_RESTORE __pragma(warning(pop)) #else // ! MSVC compiler #define _SWR_WARN_DISABLE #define _SWR_WARN_RESTORE #endif -#define _SWR_MACRO_START do { -#define _SWR_MACRO_END \ - _SWR_WARN_DISABLE \ - } while(0) \ +#define _SWR_MACRO_START \ + do \ + { +#define _SWR_MACRO_END \ + _SWR_WARN_DISABLE \ + } \ + while (0) \ _SWR_WARN_RESTORE - #if defined(_WIN32) -#define SWR_ASSUME(e, ...) _SWR_MACRO_START __assume(e); _SWR_MACRO_END +#define SWR_ASSUME(e, ...) \ + _SWR_MACRO_START __assume(e); \ + _SWR_MACRO_END #elif defined(__clang__) -#define SWR_ASSUME(e, ...) _SWR_MACRO_START __builtin_assume(e); _SWR_MACRO_END +#define SWR_ASSUME(e, ...) \ + _SWR_MACRO_START __builtin_assume(e); \ + _SWR_MACRO_END #elif defined(__GNUC__) -#define SWR_ASSUME(e, ...) _SWR_MACRO_START ((e) ? ((void)0) : __builtin_unreachable()); _SWR_MACRO_END +#define SWR_ASSUME(e, ...) \ + _SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \ + _SWR_MACRO_END #else -#define SWR_ASSUME(e, ...) _SWR_MACRO_START ASSUME(e); _SWR_MACRO_END +#define SWR_ASSUME(e, ...) \ + _SWR_MACRO_START ASSUME(e); \ + _SWR_MACRO_END #endif #if !defined(SWR_ENABLE_ASSERTS) @@ -110,47 +120,50 @@ #else -bool SwrAssert( - bool chkDebugger, - bool& enabled, - const char* pExpression, - const char* pFileName, - uint32_t lineNum, - const char* function, - const char* pFmtString = nullptr, - ...); +bool SwrAssert(bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* function, + const char* pFmtString = nullptr, + ...); void SwrTrace( - const char* pFileName, - uint32_t lineNum, - const char* function, - const char* pFmtString, - ...); - -#define _SWR_ASSERT(chkDebugger, e, ...) \ - _SWR_MACRO_START \ - bool expFailed = !(e);\ - if (expFailed) {\ - static bool swrAssertEnabled = true;\ - expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\ - if (expFailed) { DEBUGBREAK; }\ - }\ + const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...); + +#define _SWR_ASSERT(chkDebugger, e, ...) \ + _SWR_MACRO_START \ + bool expFailed = !(e); \ + if (expFailed) \ + { \ + static bool swrAssertEnabled = true; \ + expFailed = SwrAssert( \ + chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \ + if (expFailed) \ + { \ + DEBUGBREAK; \ + } \ + } \ _SWR_MACRO_END -#define _SWR_INVALID(chkDebugger, ...) \ - _SWR_MACRO_START \ - static bool swrAssertEnabled = true;\ - bool expFailed = SwrAssert(chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\ - if (expFailed) { DEBUGBREAK; }\ +#define _SWR_INVALID(chkDebugger, ...) 
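
The _SWR_MACRO_START/_SWR_MACRO_END pair above is the classic do { ... } while (0) idiom: it makes a multi-statement macro expand to exactly one statement, so it nests safely under an unbraced if/else. A hedged illustration with a hypothetical MY_CHECK macro (not from this commit):

#include <cstdio>

#define MY_CHECK(e)                                         \
    do                                                      \
    {                                                       \
        if (!(e))                                           \
            std::fprintf(stderr, "check failed: %s\n", #e); \
    } while (0)

// if (cond) MY_CHECK(x > 0); else ...   // parses as intended
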
\ + _SWR_MACRO_START \ + static bool swrAssertEnabled = true; \ + bool expFailed = SwrAssert( \ + chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \ + if (expFailed) \ + { \ + DEBUGBREAK; \ + } \ _SWR_MACRO_END -#define _SWR_TRACE(_fmtstr, ...) \ - SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__); +#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__); #if SWR_ENABLE_ASSERTS -#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__) -#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__) -#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) +#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__) +#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__) +#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) #if defined(assert) #undef assert @@ -160,24 +173,25 @@ void SwrTrace( #endif // SWR_ENABLE_ASSERTS #if SWR_ENABLE_REL_ASSERTS -#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__) -#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__) -#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) +#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__) +#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__) +#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) // SWR_INVALID is always enabled // Funky handling to allow 0 arguments with g++/gcc // This is needed because you can't "swallow commas" with ##_VA_ARGS__ unless // there is a first argument to the macro. So having a macro that can optionally // accept 0 arguments is tricky. -#define _SWR_INVALID_0() _SWR_INVALID(false) -#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__) +#define _SWR_INVALID_0() _SWR_INVALID(false) +#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__) #define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N -#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) -#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 9, 9, 10 -#define _SWR_INVALID_CONCAT_(a, b) a##b -#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b) -#define SWR_INVALID(...) \ - _SWR_INVALID_CONCAT(_SWR_INVALID_,_SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__ ()))(__VA_ARGS__) +#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) +#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 9, 9, 10 +#define _SWR_INVALID_CONCAT_(a, b) a##b +#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b) +#define SWR_INVALID(...) \ + _SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \ + (__VA_ARGS__) #endif #endif // C++ @@ -185,20 +199,33 @@ void SwrTrace( #endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS // Needed to allow passing bitfield members to sizeof() in disabled asserts -template<typename T> -static bool SwrSizeofWorkaround(T) {return false;} +template <typename T> +static bool SwrSizeofWorkaround(T) +{ + return false; +} #if !SWR_ENABLE_ASSERTS -#define SWR_ASSERT(e, ...) _SWR_MACRO_START (void)sizeof(SwrSizeofWorkaround(e)); _SWR_MACRO_END -#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__) -#define SWR_TRACE(_fmtstr, ...) _SWR_MACRO_START (void)(0); _SWR_MACRO_END +#define SWR_ASSERT(e, ...) \ + _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \ + _SWR_MACRO_END +#define SWR_ASSUME_ASSERT(e, ...) 
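
The argument-counting machinery above exists because, as its comment notes, ##__VA_ARGS__ only swallows its leading comma when a named parameter precedes the variadic part; a macro whose parameters are all variadic cannot accept zero arguments without a selector like _SWR_INVALID_VARGS. A hedged sketch of the benign case (LOG is hypothetical; comma swallowing via ## is a GNU extension also honored by MSVC):

#include <cstdio>

// fmt is a named parameter, so ##__VA_ARGS__ can drop the trailing comma
#define LOG(fmt, ...) std::fprintf(stderr, fmt "\n", ##__VA_ARGS__)

// LOG("plain message");   // OK even with no variadic arguments
// LOG("value=%d", 42);    // OK
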
SWR_ASSUME(e, ##__VA_ARGS__) +#define SWR_TRACE(_fmtstr, ...) \ + _SWR_MACRO_START(void)(0); \ + _SWR_MACRO_END #endif #if !SWR_ENABLE_REL_ASSERTS -#define SWR_REL_ASSERT(e, ...) _SWR_MACRO_START (void)sizeof(SwrSizeofWorkaround(e)); _SWR_MACRO_END -#define SWR_INVALID(...) _SWR_MACRO_START (void)(0); _SWR_MACRO_END -#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__) -#define SWR_REL_TRACE(_fmtstr, ...) _SWR_MACRO_START (void)(0); _SWR_MACRO_END +#define SWR_REL_ASSERT(e, ...) \ + _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \ + _SWR_MACRO_END +#define SWR_INVALID(...) \ + _SWR_MACRO_START(void)(0); \ + _SWR_MACRO_END +#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__) +#define SWR_REL_TRACE(_fmtstr, ...) \ + _SWR_MACRO_START(void)(0); \ + _SWR_MACRO_END #endif #if defined(_MSC_VER) @@ -211,4 +238,4 @@ static bool SwrSizeofWorkaround(T) {return false;} #define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL) -#endif//__SWR_ASSERT_H__ +#endif //__SWR_ASSERT_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index c932ec0bd66..00f331303ee 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file api.cpp -* -* @brief API implementation -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file api.cpp + * + * @brief API implementation + * + ******************************************************************************/ #include <cfloat> #include <cmath> @@ -46,16 +46,16 @@ #include "common/os.h" -static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y }; +static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y}; -void SetupDefaultState(SWR_CONTEXT *pContext); +void SetupDefaultState(SWR_CONTEXT* pContext); static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) { return (SWR_CONTEXT*)hContext; } -void WakeAllThreads(SWR_CONTEXT *pContext) +void WakeAllThreads(SWR_CONTEXT* pContext) { pContext->FifosNotEmpty.notify_all(); } @@ -63,15 +63,14 @@ void WakeAllThreads(SWR_CONTEXT *pContext) ////////////////////////////////////////////////////////////////////////// /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. -HANDLE SwrCreateContext( - SWR_CREATECONTEXT_INFO* pCreateInfo) +HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) { RDTSC_RESET(); RDTSC_INIT(0); void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); memset(pContextMem, 0, sizeof(SWR_CONTEXT)); - SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); + SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT(); pContext->privateStateSize = pCreateInfo->privateStateSize; @@ -84,8 +83,10 @@ HANDLE SwrCreateContext( pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); - pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64); - pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64); + pContext->pMacroTileManagerArray = + (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64); + pContext->pDispatchQueueArray = + (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64); for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) { @@ -102,14 +103,14 @@ HANDLE SwrCreateContext( } else { - pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; - pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE; - pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE; - pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD; - pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; - pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; - pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; - pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; + pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; + pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE; + pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE; + pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD; + pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; + pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; 
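
SwrCreateContext above pairs an aligned raw allocation with placement new so that the SWR_CONTEXT constructor runs in SIMD-aligned storage. A minimal model using C++17 aligned operator new in place of AlignedMalloc (illustrative, not the commit's code):

#include <cstddef>
#include <new>

struct Ctx { int value = 0; };

static Ctx* create_ctx(std::size_t alignment)
{
    void* mem = ::operator new(sizeof(Ctx), std::align_val_t(alignment));
    return new (mem) Ctx();    // construct in the aligned storage
}

static void destroy_ctx(Ctx* p, std::size_t alignment)
{
    p->~Ctx();                 // destroy, then free with the matching delete
    ::operator delete(p, std::align_val_t(alignment));
}
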
+ pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; + pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; } if (pCreateInfo->pApiThreadInfo) @@ -118,9 +119,9 @@ HANDLE SwrCreateContext( } else { - pContext->apiThreadInfo.bindAPIThread0 = true; - pContext->apiThreadInfo.numAPIReservedThreads = 1; - pContext->apiThreadInfo.numAPIThreadsPerCore = 1; + pContext->apiThreadInfo.bindAPIThread0 = true; + pContext->apiThreadInfo.numAPIReservedThreads = 1; + pContext->apiThreadInfo.numAPIThreadsPerCore = 1; } if (pCreateInfo->pWorkerPrivateState) @@ -146,12 +147,14 @@ HANDLE SwrCreateContext( } pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; - pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64); + pContext->pStats = + (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64); #if defined(KNOB_ENABLE_AR) // Setup ArchRast thread contexts which includes +1 for API thread. - pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1]; - pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); + pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1]; + pContext->pArContext[pContext->NumWorkerThreads] = + ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); #endif // Allocate scratch space for workers. @@ -159,14 +162,17 @@ HANDLE SwrCreateContext( for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { #if defined(_WIN32) - uint32_t numaNode = pContext->threadPool.pThreadData ? - pContext->threadPool.pThreadData[i].numaId : 0; - pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma( - GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), - MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, - numaNode); + uint32_t numaNode = + pContext->threadPool.pThreadData ? 
pContext->threadPool.pThreadData[i].numaId : 0; + pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(), + nullptr, + 32 * sizeof(KILOBYTE), + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE, + numaNode); #else - pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); + pContext->ppScratch[i] = + (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); #endif #if defined(KNOB_ENABLE_AR) @@ -187,13 +193,13 @@ HANDLE SwrCreateContext( pContext->pHotTileMgr = new HotTileMgr(); // initialize callback functions - pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; - pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; - pContext->pfnClearTile = pCreateInfo->pfnClearTile; + pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; + pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; + pContext->pfnClearTile = pCreateInfo->pfnClearTile; pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; - pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; - pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; - + pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; + pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; + // pass pointer to bucket manager back to caller #ifdef KNOB_ENABLE_RDTSC @@ -212,11 +218,11 @@ void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) memcpy(&dst.state, &src.state, sizeof(API_STATE)); } -template<bool IsDraw> -void QueueWork(SWR_CONTEXT *pContext) +template <bool IsDraw> +void QueueWork(SWR_CONTEXT* pContext) { - DRAW_CONTEXT* pDC = pContext->pCurDrawContext; - uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; + DRAW_CONTEXT* pDC = pContext->pCurDrawContext; + uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; if (IsDraw) { @@ -249,7 +255,8 @@ void QueueWork(SWR_CONTEXT *pContext) if (IsDraw) { - uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId, + pContext->pCurDrawContext->drawId}; WorkOnFifoFE(pContext, 0, curDraw[0]); WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0); } @@ -259,8 +266,11 @@ void QueueWork(SWR_CONTEXT *pContext) WorkOnCompute(pContext, 0, curDispatch); } - // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). - while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} + // Dequeue the work here, if not already done, since we're single threaded (i.e. no + // workers). + while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) + { + } // restore csr _mm_setcsr(mxcsr); @@ -272,9 +282,10 @@ void QueueWork(SWR_CONTEXT *pContext) RDTSC_END(APIDrawWakeAllThreads, 1); } - // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. + // Set current draw context to NULL so that next state call forces a new draw context to be + // created and populated. 
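
The _mm_setcsr call above closes a save/restore bracket around the single-threaded QueueWork path, so inline front-end and back-end work cannot leak MXCSR changes (rounding mode, FTZ/DAZ) back to the caller. A reduced sketch, with do_work as a hypothetical placeholder:

#include <immintrin.h>

static void with_saved_mxcsr(void (*do_work)())
{
    unsigned int mxcsr = _mm_getcsr();   // save FP control/status state
    do_work();                           // may alter MXCSR internally
    _mm_setcsr(mxcsr);                   // restore the caller's state
}
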
pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; + pContext->pCurDrawContext = nullptr; } INLINE void QueueDraw(SWR_CONTEXT* pContext) @@ -287,7 +298,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext) QueueWork<false>(pContext); } -DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) +DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false) { RDTSC_BEGIN(APIGetDrawContext, 0); // If current draw context is null then need to obtain a new draw context to use from ring. @@ -309,14 +320,14 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pContext->cachingArenaAllocator.FreeOldBlocks(); pContext->lastFrameChecked = pContext->frameCount; - pContext->lastDrawChecked = curDraw; + pContext->lastDrawChecked = curDraw; } DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; + pContext->pCurDrawContext = pCurDrawContext; // Assign next available entry in DS ring to this DC. - uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT; + uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; // Copy previous state to current state. @@ -336,7 +347,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->pState->pPrivateState = nullptr; - pContext->curStateId++; // Progress state ring index forward. + pContext->curStateId++; // Progress state ring index forward. } else { @@ -349,21 +360,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) else { SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); - pContext->curStateId++; // Progress state ring index forward. + pContext->curStateId++; // Progress state ring index forward. } SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); // Reset dependency - pCurDrawContext->dependent = false; + pCurDrawContext->dependent = false; pCurDrawContext->dependentFE = false; - pCurDrawContext->pContext = pContext; + pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
- pCurDrawContext->doneFE = false; - pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDone = 0; + pCurDrawContext->doneFE = false; + pCurDrawContext->FeLock = 0; + pCurDrawContext->threadsDone = 0; pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads); @@ -382,7 +393,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) return pContext->pCurDrawContext; } -API_STATE* GetDrawState(SWR_CONTEXT *pContext) +API_STATE* GetDrawState(SWR_CONTEXT* pContext) { DRAW_CONTEXT* pDC = GetDrawContext(pContext); SWR_ASSERT(pDC->pState != nullptr); @@ -392,13 +403,13 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext) void SwrDestroyContext(HANDLE hContext) { - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); - pDC->FeWork.type = SHUTDOWN; + pDC->FeWork.type = SHUTDOWN; pDC->FeWork.pfnWork = ProcessShutdown; - //enqueue + // enqueue QueueDraw(pContext); DestroyThreadPool(pContext, &pContext->threadPool); @@ -442,72 +453,65 @@ void SwrDestroyContext(HANDLE hContext) void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId) { - SWR_CONTEXT *pContext = GetContext(hContext); + SWR_CONTEXT* pContext = GetContext(hContext); BindApiThread(pContext, apiThreadId); } -void SWR_API SwrSaveState( - HANDLE hContext, - void* pOutputStateBlock, - size_t memSize) +void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize) { - SWR_CONTEXT *pContext = GetContext(hContext); - auto pSrc = GetDrawState(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + auto pSrc = GetDrawState(pContext); SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); } -void SWR_API SwrRestoreState( - HANDLE hContext, - const void* pStateBlock, - size_t memSize) +void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize) { - SWR_CONTEXT *pContext = GetContext(hContext); - auto pDst = GetDrawState(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + auto pDst = GetDrawState(pContext); SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); memcpy(pDst, pStateBlock, sizeof(*pDst)); } -void SetupDefaultState(SWR_CONTEXT *pContext) +void SetupDefaultState(SWR_CONTEXT* pContext) { API_STATE* pState = GetDrawState(pContext); - pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; - pState->depthBoundsState.depthBoundsTestEnable = false; + pState->depthBoundsState.depthBoundsTestEnable = false; pState->depthBoundsState.depthBoundsTestMinValue = 0.0f; pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f; } -void SWR_API SwrSync( - HANDLE hContext, - PFN_CALLBACK_FUNC pfnFunc, - uint64_t userData, - uint64_t userData2, - uint64_t userData3) +void SWR_API SwrSync(HANDLE hContext, + PFN_CALLBACK_FUNC pfnFunc, + uint64_t userData, + uint64_t userData2, + uint64_t userData3) { SWR_ASSERT(pfnFunc != nullptr); - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); RDTSC_BEGIN(APISync, 0); - pDC->FeWork.type = SYNC; + pDC->FeWork.type = SYNC; pDC->FeWork.pfnWork = ProcessSync; // Setup callback function pDC->retireCallback.pfnCallbackFunc = pfnFunc; - 
pDC->retireCallback.userData = userData; - pDC->retireCallback.userData2 = userData2; - pDC->retireCallback.userData3 = userData3; + pDC->retireCallback.userData = userData; + pDC->retireCallback.userData2 = userData2; + pDC->retireCallback.userData3 = userData3; AR_API_EVENT(SwrSyncEvent(pDC->drawId)); - //enqueue + // enqueue QueueDraw(pContext); RDTSC_END(APISync, 1); @@ -515,15 +519,15 @@ void SWR_API SwrSync( void SwrStallBE(HANDLE hContext) { - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->dependent = true; } void SwrWaitForIdle(HANDLE hContext) { - SWR_CONTEXT *pContext = GetContext(hContext); + SWR_CONTEXT* pContext = GetContext(hContext); RDTSC_BEGIN(APIWaitForIdle, 0); @@ -537,7 +541,7 @@ void SwrWaitForIdle(HANDLE hContext) void SwrWaitForIdleFE(HANDLE hContext) { - SWR_CONTEXT *pContext = GetContext(hContext); + SWR_CONTEXT* pContext = GetContext(hContext); RDTSC_BEGIN(APIWaitForIdle, 0); @@ -549,42 +553,34 @@ void SwrWaitForIdleFE(HANDLE hContext) RDTSC_END(APIWaitForIdle, 1); } -void SwrSetVertexBuffers( - HANDLE hContext, - uint32_t numBuffers, - const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) +void SwrSetVertexBuffers(HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) { API_STATE* pState = GetDrawState(GetContext(hContext)); for (uint32_t i = 0; i < numBuffers; ++i) { - const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; - pState->vertexBuffers[pVB->index] = *pVB; + const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i]; + pState->vertexBuffers[pVB->index] = *pVB; } } -void SwrSetIndexBuffer( - HANDLE hContext, - const SWR_INDEX_BUFFER_STATE* pIndexBuffer) +void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->indexBuffer = *pIndexBuffer; } -void SwrSetFetchFunc( - HANDLE hContext, - PFN_FETCH_FUNC pfnFetchFunc) +void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnFetchFunc = pfnFetchFunc; } -void SwrSetSoFunc( - HANDLE hContext, - PFN_SO_FUNC pfnSoFunc, - uint32_t streamIndex) +void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex) { API_STATE* pState = GetDrawState(GetContext(hContext)); @@ -593,19 +589,14 @@ void SwrSetSoFunc( pState->pfnSoFunc[streamIndex] = pfnSoFunc; } -void SwrSetSoState( - HANDLE hContext, - SWR_STREAMOUT_STATE* pSoState) +void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->soState = *pSoState; } -void SwrSetSoBuffers( - HANDLE hContext, - SWR_STREAMOUT_BUFFER* pSoBuffer, - uint32_t slot) +void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot) { API_STATE* pState = GetDrawState(GetContext(hContext)); @@ -614,168 +605,136 @@ void SwrSetSoBuffers( pState->soBuffer[slot] = *pSoBuffer; } -void SwrSetVertexFunc( - HANDLE hContext, - PFN_VERTEX_FUNC pfnVertexFunc) +void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnVertexFunc = pfnVertexFunc; } -void SwrSetFrontendState( - HANDLE hContext, - SWR_FRONTEND_STATE *pFEState) +void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState) { - API_STATE* pState = GetDrawState(GetContext(hContext)); + API_STATE* 
pState = GetDrawState(GetContext(hContext)); pState->frontendState = *pFEState; } -void SwrSetGsState( - HANDLE hContext, - SWR_GS_STATE *pGSState) +void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState) { API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->gsState = *pGSState; + pState->gsState = *pGSState; } -void SwrSetGsFunc( - HANDLE hContext, - PFN_GS_FUNC pfnGsFunc) +void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnGsFunc = pfnGsFunc; } -void SwrSetCsFunc( - HANDLE hContext, - PFN_CS_FUNC pfnCsFunc, - uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize, - uint32_t scratchSpaceSizePerInstance, - uint32_t numInstances) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->pfnCsFunc = pfnCsFunc; - pState->totalThreadsInGroup = totalThreadsInGroup; - pState->totalSpillFillSize = totalSpillFillSize; - pState->scratchSpaceSize = scratchSpaceSizePerInstance; +void SwrSetCsFunc(HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup, + uint32_t totalSpillFillSize, + uint32_t scratchSpaceSizePerInstance, + uint32_t numInstances) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->pfnCsFunc = pfnCsFunc; + pState->totalThreadsInGroup = totalThreadsInGroup; + pState->totalSpillFillSize = totalSpillFillSize; + pState->scratchSpaceSize = scratchSpaceSizePerInstance; pState->scratchSpaceNumInstances = numInstances; } -void SwrSetTsState( - HANDLE hContext, - SWR_TS_STATE *pState) +void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState) { API_STATE* pApiState = GetDrawState(GetContext(hContext)); - pApiState->tsState = *pState; + pApiState->tsState = *pState; } -void SwrSetHsFunc( - HANDLE hContext, - PFN_HS_FUNC pfnFunc) +void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc) { API_STATE* pApiState = GetDrawState(GetContext(hContext)); pApiState->pfnHsFunc = pfnFunc; } -void SwrSetDsFunc( - HANDLE hContext, - PFN_DS_FUNC pfnFunc) +void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc) { API_STATE* pApiState = GetDrawState(GetContext(hContext)); pApiState->pfnDsFunc = pfnFunc; } -void SwrSetDepthStencilState( - HANDLE hContext, - SWR_DEPTH_STENCIL_STATE *pDSState) +void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->depthStencilState = *pDSState; } -void SwrSetBackendState( - HANDLE hContext, - SWR_BACKEND_STATE *pBEState) +void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->backendState = *pBEState; } -void SwrSetDepthBoundsState( - HANDLE hContext, - SWR_DEPTH_BOUNDS_STATE *pDBState) +void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->depthBoundsState = *pDBState; } -void SwrSetPixelShaderState( - HANDLE hContext, - SWR_PS_STATE *pPSState) +void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState) { - API_STATE *pState = GetDrawState(GetContext(hContext)); - pState->psState = *pPSState; + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->psState = *pPSState; } -void SwrSetBlendState( - HANDLE hContext, - SWR_BLEND_STATE *pBlendState) +void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState) { - API_STATE *pState = GetDrawState(GetContext(hContext)); + API_STATE* pState = GetDrawState(GetContext(hContext)); 
memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); } -void SwrSetBlendFunc( - HANDLE hContext, - uint32_t renderTarget, - PFN_BLEND_JIT_FUNC pfnBlendFunc) +void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc) { SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); - API_STATE *pState = GetDrawState(GetContext(hContext)); + API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; } // update guardband multipliers for the viewport -void updateGuardbands(API_STATE *pState) +void updateGuardbands(API_STATE* pState) { uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; - for(uint32_t i = 0; i < numGbs; ++i) + for (uint32_t i = 0; i < numGbs; ++i) { // guardband center is viewport center - pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; - pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; - pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; + pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; + pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; + pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; } } -void SwrSetRastState( - HANDLE hContext, - const SWR_RASTSTATE *pRastState) +void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState) { - SWR_CONTEXT *pContext = GetContext(hContext); - API_STATE* pState = GetDrawState(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); } -void SwrSetViewports( - HANDLE hContext, - uint32_t numViewports, - const SWR_VIEWPORT* pViewports, - const SWR_VIEWPORT_MATRICES* pMatrices) +void SwrSetViewports(HANDLE hContext, + uint32_t numViewports, + const SWR_VIEWPORT* pViewports, + const SWR_VIEWPORT_MATRICES* pMatrices) { - SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, - "Invalid number of viewports."); + SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports."); - SWR_CONTEXT *pContext = GetContext(hContext); - API_STATE* pState = GetDrawState(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); // @todo Faster to copy portions of the SOA or just copy all of it? @@ -784,27 +743,24 @@ void SwrSetViewports( updateGuardbands(pState); } -void SwrSetScissorRects( - HANDLE hContext, - uint32_t numScissors, - const SWR_RECT* pScissors) +void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors) { - SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, - "Invalid number of scissor rects."); + SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects."); API_STATE* pState = GetDrawState(GetContext(hContext)); memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0])); }; -void SetupMacroTileScissors(DRAW_CONTEXT *pDC) +void SetupMacroTileScissors(DRAW_CONTEXT* pDC) { - API_STATE *pState = &pDC->pState->state; - uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; + API_STATE* pState = &pDC->pState->state; + uint32_t numScissors = + pState->backendState.readViewportArrayIndex ? 
KNOB_NUM_VIEWPORTS_SCISSORS : 1; pState->scissorsTileAligned = true; for (uint32_t index = 0; index < numScissors; ++index) { - SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index]; + SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index]; // Set up scissor dimensions based on scissor or viewport if (pState->rastState.scissorEnable) @@ -813,8 +769,9 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) } else { - // the vp width and height must be added to origin un-rounded then the result round to -inf. - // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. + // the vp width and height must be added to origin un-rounded then the result round to + // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are + // positive. scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; @@ -826,7 +783,7 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) // Test for tile alignment bool tileAligned; - tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; + tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; @@ -848,12 +805,12 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) // templated backend function tables -void SetupPipeline(DRAW_CONTEXT *pDC) +void SetupPipeline(DRAW_CONTEXT* pDC) { - DRAW_STATE* pState = pDC->pState; - const SWR_RASTSTATE &rastState = pState->state.rastState; - const SWR_PS_STATE &psState = pState->state.psState; - BACKEND_FUNCS& backendFuncs = pState->backendFuncs; + DRAW_STATE* pState = pDC->pState; + const SWR_RASTSTATE& rastState = pState->state.rastState; + const SWR_PS_STATE& psState = pState->state.psState; + BACKEND_FUNCS& backendFuncs = pState->backendFuncs; // setup backend if (psState.pfnPixelShader == nullptr) @@ -863,35 +820,46 @@ void SetupPipeline(DRAW_CONTEXT *pDC) else { const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; - const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0; - const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; - const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0; + const bool bMultisampleEnable = + ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0; + const uint32_t centroid = + ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t canEarlyZ = + (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 
1 : 0; SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; - + // select backend function - switch(psState.shadingRate) + switch (psState.shadingRate) { case SWR_SHADING_RATE_PIXEL: - if(bMultisampleEnable) + if (bMultisampleEnable) { // always need to generate I & J per sample for Z interpolation - barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage] - [centroid][forcedSampleCount][canEarlyZ] + barycentricsMask = + (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = + gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern] + [psState.inputCoverage][centroid][forcedSampleCount] + [canEarlyZ] ; } else { // always need to generate I & J per pixel for Z interpolation - barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; + barycentricsMask = + (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); + backendFuncs.pfnBackend = + gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.bIsCenterPattern != true); // always need to generate I & J per sample for Z interpolation - barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; + barycentricsMask = + (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = + gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid] + [canEarlyZ]; break; default: SWR_ASSERT(0 && "Invalid shading rate"); @@ -909,10 +877,10 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { case TOP_POINT_LIST: pState->pfnProcessPrims = ClipPoints; - pfnBinner = BinPoints; + pfnBinner = BinPoints; #if USE_SIMD16_FRONTEND pState->pfnProcessPrims_simd16 = ClipPoints_simd16; - pfnBinner_simd16 = BinPoints_simd16; + pfnBinner_simd16 = BinPoints_simd16; #endif break; case TOP_LINE_LIST: @@ -921,15 +889,15 @@ void SetupPipeline(DRAW_CONTEXT *pDC) case TOP_LINE_LIST_ADJ: case TOP_LISTSTRIP_ADJ: pState->pfnProcessPrims = ClipLines; - pfnBinner = BinLines; + pfnBinner = BinLines; #if USE_SIMD16_FRONTEND pState->pfnProcessPrims_simd16 = ClipLines_simd16; - pfnBinner_simd16 = BinLines_simd16; + pfnBinner_simd16 = BinLines_simd16; #endif break; default: pState->pfnProcessPrims = ClipTriangles; - pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); + pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); #if USE_SIMD16_FRONTEND pState->pfnProcessPrims_simd16 = ClipTriangles_simd16; pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0)); @@ -971,14 +939,16 @@ void SetupPipeline(DRAW_CONTEXT *pDC) // set up the frontend attribute count - pState->state.feNumAttributes = 0; + pState->state.feNumAttributes = 0; const SWR_BACKEND_STATE& backendState = pState->state.backendState; if (backendState.swizzleEnable) { // attribute swizzling is enabled, iterate over the map and record the max attribute used for (uint32_t i = 0; i < backendState.numAttributes; ++i) { - pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, 
(uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); + pState->state.feNumAttributes = + std::max(pState->state.feNumAttributes, + (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); } } else @@ -997,37 +967,44 @@ void SetupPipeline(DRAW_CONTEXT *pDC) DWORD maxAttrib; if (_BitScanReverse64(&maxAttrib, streamMasks)) { - pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); + pState->state.feNumAttributes = + std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); } } // complicated logic to test for cases where we don't need backing hottile memory for a draw - // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. - pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && - !pState->state.depthStencilState.depthWriteEnable && - !pState->state.depthBoundsState.depthBoundsTestEnable && - pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && - (pState->state.depthStencilState.depthTestEnable || - pState->state.depthStencilState.depthWriteEnable || - pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false; - - pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && - !pState->state.depthStencilState.stencilWriteEnable && - pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || - // for stencil we have to check the double sided state as well - (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && - !pState->state.depthStencilState.stencilWriteEnable && - pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && - (pState->state.depthStencilState.stencilTestEnable || - pState->state.depthStencilState.stencilWriteEnable)) ? true : false; - + // have to check for the special case where depth/stencil test is enabled but depthwrite is + // disabled. + pState->state.depthHottileEnable = + ((!(pState->state.depthStencilState.depthTestEnable && + !pState->state.depthStencilState.depthWriteEnable && + !pState->state.depthBoundsState.depthBoundsTestEnable && + pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && + (pState->state.depthStencilState.depthTestEnable || + pState->state.depthStencilState.depthWriteEnable || + pState->state.depthBoundsState.depthBoundsTestEnable)) + ? true + : false; + + pState->state.stencilHottileEnable = + (((!(pState->state.depthStencilState.stencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || + // for stencil we have to check the double sided state as well + (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && + (pState->state.depthStencilState.stencilTestEnable || + pState->state.depthStencilState.stencilWriteEnable)) + ? 
true + : false; uint32_t hotTileEnable = pState->state.psState.renderTargetMask; // Disable hottile for surfaces with no writes if (psState.pfnPixelShader != nullptr) { - DWORD rt; + DWORD rt; uint32_t rtMask = pState->state.psState.renderTargetMask; while (_BitScanForward(&rt, rtMask)) { @@ -1045,33 +1022,39 @@ void SetupPipeline(DRAW_CONTEXT *pDC) pState->state.colorHottileEnable = hotTileEnable; - // Setup depth quantization function if (pState->state.depthHottileEnable) { switch (pState->state.rastState.depthFormat) { - case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break; - case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; - case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; - case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; - default: SWR_INVALID("Unsupported depth format for depth quantiztion."); - pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + case R32_FLOAT_X8X24_TYPELESS: + pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>; + break; + case R32_FLOAT: + pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; + break; + case R24_UNORM_X8_TYPELESS: + pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>; + break; + case R16_UNORM: + pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>; + break; + default: + SWR_INVALID("Unsupported depth format for depth quantiztion."); + pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; } } else { // set up pass-through quantize if depth isn't enabled - pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; } } ////////////////////////////////////////////////////////////////////////// /// @brief InitDraw /// @param pDC - Draw context to initialize for this draw. -void InitDraw( - DRAW_CONTEXT *pDC, - bool isSplitDraw) +void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw) { // We don't need to re-setup the scissors/pipeline state again for split draw. if (isSplitDraw == false) @@ -1079,7 +1062,6 @@ void InitDraw( SetupMacroTileScissors(pDC); SetupPipeline(pDC); } - } @@ -1087,10 +1069,7 @@ void InitDraw( /// @brief We can split the draw for certain topologies for better performance. /// @param totalVerts - Total vertices for draw /// @param topology - Topology used for draw -uint32_t MaxVertsPerDraw( - DRAW_CONTEXT* pDC, - uint32_t totalVerts, - PRIMITIVE_TOPOLOGY topology) +uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology) { API_STATE& state = pDC->pState->state; @@ -1157,7 +1136,7 @@ uint32_t MaxVertsPerDraw( if (pDC->pState->state.tsState.tsEnable) { uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; - vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; + vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; } break; default: @@ -1168,7 +1147,6 @@ uint32_t MaxVertsPerDraw( return vertsPerDraw; } - ////////////////////////////////////////////////////////////////////////// /// @brief DrawInstanced /// @param hContext - Handle passed back from SwrCreateContext @@ -1176,31 +1154,31 @@ uint32_t MaxVertsPerDraw( /// @param numVerts - How many vertices to read sequentially from vertex data (per instance). /// @param startVertex - Specifies start vertex for draw. (vertex data) /// @param numInstances - How many instances to render. 
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -void DrawInstanced( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertices, - uint32_t startVertex, - uint32_t numInstances = 1, - uint32_t startInstance = 0) +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +void DrawInstanced(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertices, + uint32_t startVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); RDTSC_BEGIN(APIDraw, pDC->drawId); uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); - uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); - uint32_t remainingVerts = numVertices; + uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); + uint32_t remainingVerts = numVertices; - API_STATE *pState = &pDC->pState->state; - pState->topology = topology; + API_STATE* pState = &pDC->pState->state; + pState->topology = topology; pState->forceFront = false; // disable culling for points/lines @@ -1208,7 +1186,7 @@ void DrawInstanced( if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; - pState->forceFront = true; + pState->forceFront = true; } else if (topology == TOP_RECT_LIST) { @@ -1218,42 +1196,50 @@ void DrawInstanced( int draw = 0; while (remainingVerts) { - uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? - remainingVerts : maxVertsPerDraw; + uint32_t numVertsForDraw = + (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw; - bool isSplitDraw = (draw > 0) ? true : false; - DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); + bool isSplitDraw = (draw > 0) ? 
true : false; + DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); - pDC->FeWork.type = DRAW; - pDC->FeWork.pfnWork = GetProcessDrawFunc( - false, // IsIndexed - false, // bEnableCutIndex - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pDC->pState->pfnProcessPrims != nullptr); - pDC->FeWork.desc.draw.numVerts = numVertsForDraw; - pDC->FeWork.desc.draw.startVertex = startVertex; - pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetProcessDrawFunc(false, // IsIndexed + false, // bEnableCutIndex + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.numVerts = numVertsForDraw; + pDC->FeWork.desc.draw.startVertex = startVertex; + pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; - pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; pDC->cleanupState = (remainingVerts == numVertsForDraw); - //enqueue DC + // enqueue DC QueueDraw(pContext); - AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertsForDraw, startVertex, numInstances, - startInstance, pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pState->gsState.outputTopology, draw)); + AR_API_EVENT(DrawInstancedEvent(pDC->drawId, + topology, + numVertsForDraw, + startVertex, + numInstances, + startInstance, + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pState->gsState.outputTopology, + draw)); remainingVerts -= numVertsForDraw; draw++; } // restore culling state - pDC = GetDrawContext(pContext); + pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; RDTSC_END(APIDraw, numVertices * numInstances); @@ -1265,11 +1251,10 @@ void DrawInstanced( /// @param topology - Specifies topology for draw. /// @param startVertex - Specifies start vertex in vertex buffer for draw. /// @param primCount - Number of vertices. -void SwrDraw( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t startVertex, - uint32_t numVertices) +void SwrDraw(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t startVertex, + uint32_t numVertices) { DrawInstanced(hContext, topology, numVertices, startVertex); } @@ -1281,17 +1266,17 @@ void SwrDraw( /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. /// @param numInstances - How many instances to render. /// @param startVertex - Specifies start vertex for draw. 
(vertex data) -/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -void SwrDrawInstanced( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertsPerInstance, - uint32_t numInstances, - uint32_t startVertex, - uint32_t startInstance - ) -{ - DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +void SwrDrawInstanced(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertsPerInstance, + uint32_t numInstances, + uint32_t startVertex, + uint32_t startInstance) +{ + DrawInstanced( + hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); } ////////////////////////////////////////////////////////////////////////// @@ -1302,46 +1287,52 @@ void SwrDrawInstanced( /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. /// @param numInstances - Number of instances to render. -/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -void DrawIndexedInstance( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t numInstances = 1, - uint32_t startInstance = 0) +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +void DrawIndexedInstance(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - API_STATE* pState = &pDC->pState->state; + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + API_STATE* pState = &pDC->pState->state; RDTSC_BEGIN(APIDrawIndexed, pDC->drawId); uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); - uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); - uint32_t remainingIndices = numIndices; + uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); + uint32_t remainingIndices = numIndices; uint32_t indexSize = 0; switch (pState->indexBuffer.format) { - case R32_UINT: indexSize = sizeof(uint32_t); break; - case R16_UINT: indexSize = sizeof(uint16_t); break; - case R8_UINT: indexSize = sizeof(uint8_t); break; + case R32_UINT: + indexSize = sizeof(uint32_t); + break; + case R16_UINT: + indexSize = sizeof(uint16_t); + break; + case R8_UINT: + indexSize = sizeof(uint8_t); + break; default: SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format); } - int draw = 0; + int draw = 0; gfxptr_t xpIB = pState->indexBuffer.xpIndices; xpIB += (uint64_t)indexOffset * (uint64_t)indexSize; - pState->topology = topology; + pState->topology = topology; pState->forceFront = false; // disable culling for points/lines @@ -1349,7 +1340,7 @@ void DrawIndexedInstance( if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; - pState->forceFront = true; + pState->forceFront = true; } else if (topology == TOP_RECT_LIST) { @@ -1358,8 +1349,8 @@ void DrawIndexedInstance( while (remainingIndices) { - uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? 
- remainingIndices : maxIndicesPerDraw; + uint32_t numIndicesForDraw = + (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw; // When breaking up draw, we need to obtain new draw context for each iteration. bool isSplitDraw = (draw > 0) ? true : false; @@ -1367,31 +1358,40 @@ void DrawIndexedInstance( pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); - pDC->FeWork.type = DRAW; - pDC->FeWork.pfnWork = GetProcessDrawFunc( - true, // IsIndexed - pState->frontendState.bEnableCutIndex, - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pDC->pState->pfnProcessPrims != nullptr); - pDC->FeWork.desc.draw.pDC = pDC; + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetProcessDrawFunc(true, // IsIndexed + pState->frontendState.bEnableCutIndex, + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.pDC = pDC; pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; - pDC->FeWork.desc.draw.xpIB = xpIB; - pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; + pDC->FeWork.desc.draw.xpIB = xpIB; + pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; - pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; - pDC->FeWork.desc.draw.baseVertex = baseVertex; - pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->FeWork.desc.draw.baseVertex = baseVertex; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; pDC->cleanupState = (remainingIndices == numIndicesForDraw); - //enqueue DC + // enqueue DC QueueDraw(pContext); - AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndicesForDraw, indexOffset, baseVertex, - numInstances, startInstance, pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pState->gsState.outputTopology, draw)); + AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, + topology, + numIndicesForDraw, + indexOffset, + baseVertex, + numInstances, + startInstance, + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pState->gsState.outputTopology, + draw)); xpIB += maxIndicesPerDraw * indexSize; remainingIndices -= numIndicesForDraw; @@ -1399,13 +1399,12 @@ void DrawIndexedInstance( } // Restore culling state - pDC = GetDrawContext(pContext); + pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; - + RDTSC_END(APIDrawIndexed, numIndices * numInstances); } - ////////////////////////////////////////////////////////////////////////// /// @brief DrawIndexed /// @param hContext - Handle passed back from SwrCreateContext @@ -1413,13 +1412,11 @@ void DrawIndexedInstance( /// @param numIndices - Number of indices to read sequentially from index buffer. /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -void SwrDrawIndexed( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex - ) +void SwrDrawIndexed(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex) { DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); } @@ -1432,48 +1429,49 @@ void SwrDrawIndexed( /// @param numInstances - Number of instances to render. 
/// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -void SwrDrawIndexedInstanced( - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t numInstances, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t startInstance) -{ - DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +void SwrDrawIndexedInstanced(HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t numInstances, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t startInstance) +{ + DrawIndexedInstance( + hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); } ////////////////////////////////////////////////////////////////////////// /// @brief SwrInvalidateTiles /// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to +/// invalidate. /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to /// be hottile size-aligned. -void SWR_API SwrInvalidateTiles( - HANDLE hContext, - uint32_t attachmentMask, - const SWR_RECT& invalidateRect) +void SWR_API SwrInvalidateTiles(HANDLE hContext, + uint32_t attachmentMask, + const SWR_RECT& invalidateRect) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); - pDC->FeWork.type = DISCARDINVALIDATETILES; - pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; - pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; + pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; - pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; - pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; - //enqueue + // enqueue QueueDraw(pContext); AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId)); @@ -1485,30 +1483,27 @@ void SWR_API SwrInvalidateTiles( /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be /// discarded. 
-void SWR_API SwrDiscardRect( - HANDLE hContext, - uint32_t attachmentMask, - const SWR_RECT& rect) +void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); // Queue a load to the hottile - pDC->FeWork.type = DISCARDINVALIDATETILES; - pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; - pDC->FeWork.desc.discardInvalidateTiles.rect = rect; + pDC->FeWork.desc.discardInvalidateTiles.rect = rect; pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; - pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; - pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; - //enqueue + // enqueue QueueDraw(pContext); AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId)); @@ -1520,23 +1515,23 @@ void SWR_API SwrDiscardRect( /// @param threadGroupCountX - Number of thread groups dispatched in X direction /// @param threadGroupCountY - Number of thread groups dispatched in Y direction /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -void SwrDispatch( - HANDLE hContext, - uint32_t threadGroupCountX, - uint32_t threadGroupCountY, - uint32_t threadGroupCountZ) +void SwrDispatch(HANDLE hContext, + uint32_t threadGroupCountX, + uint32_t threadGroupCountY, + uint32_t threadGroupCountZ) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); RDTSC_BEGIN(APIDispatch, pDC->drawId); - AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); - pDC->isCompute = true; // This is a compute context. + AR_API_EVENT( + DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); + pDC->isCompute = true; // This is a compute context. 
COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); @@ -1545,8 +1540,8 @@ void SwrDispatch( pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; - uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; - pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; + uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; + pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); QueueDispatch(pContext); @@ -1555,30 +1550,29 @@ void SwrDispatch( // Deswizzles, converts and stores current contents of the hot tiles to surface // described by pState -void SWR_API SwrStoreTiles( - HANDLE hContext, - uint32_t attachmentMask, - SWR_TILE_STATE postStoreTileState, - const SWR_RECT& storeRect) +void SWR_API SwrStoreTiles(HANDLE hContext, + uint32_t attachmentMask, + SWR_TILE_STATE postStoreTileState, + const SWR_RECT& storeRect) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); RDTSC_BEGIN(APIStoreTiles, pDC->drawId); - pDC->FeWork.type = STORETILES; - pDC->FeWork.pfnWork = ProcessStoreTiles; - pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; + pDC->FeWork.type = STORETILES; + pDC->FeWork.pfnWork = ProcessStoreTiles; + pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; - pDC->FeWork.desc.storeTiles.rect = storeRect; + pDC->FeWork.desc.storeTiles.rect = storeRect; pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect; - //enqueue + // enqueue QueueDraw(pContext); AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId)); @@ -1595,37 +1589,36 @@ void SWR_API SwrStoreTiles( /// @param z - depth value use for clearing depth buffer /// @param stencil - stencil value used for clearing stencil buffer /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers -void SWR_API SwrClearRenderTarget( - HANDLE hContext, - uint32_t attachmentMask, - uint32_t renderTargetArrayIndex, - const float clearColor[4], - float z, - uint8_t stencil, - const SWR_RECT& clearRect) +void SWR_API SwrClearRenderTarget(HANDLE hContext, + uint32_t attachmentMask, + uint32_t renderTargetArrayIndex, + const float clearColor[4], + float z, + uint8_t stencil, + const SWR_RECT& clearRect) { if (KNOB_TOSS_DRAW) { return; } - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); RDTSC_BEGIN(APIClearRenderTarget, pDC->drawId); - pDC->FeWork.type = CLEAR; - pDC->FeWork.pfnWork = ProcessClear; + pDC->FeWork.type = CLEAR; + pDC->FeWork.pfnWork = ProcessClear; pDC->FeWork.desc.clear.rect = clearRect; pDC->FeWork.desc.clear.rect &= g_MaxScissorRect; - pDC->FeWork.desc.clear.attachmentMask = attachmentMask; + pDC->FeWork.desc.clear.attachmentMask = attachmentMask; pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex; - pDC->FeWork.desc.clear.clearDepth = z; - pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; - pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; - pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; - pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; - 
pDC->FeWork.desc.clear.clearStencil = stencil; + pDC->FeWork.desc.clear.clearDepth = z; + pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; + pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; + pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; + pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; + pDC->FeWork.desc.clear.clearStencil = stencil; // enqueue draw QueueDraw(pContext); @@ -1639,16 +1632,16 @@ void SWR_API SwrClearRenderTarget( /// sampler. /// SWR is responsible for the allocation of the private context state. /// @param hContext - Handle passed back from SwrCreateContext -VOID* SwrGetPrivateContextState( - HANDLE hContext) +VOID* SwrGetPrivateContextState(HANDLE hContext) { - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - DRAW_STATE* pState = pDC->pState; + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + DRAW_STATE* pState = pDC->pState; if (pState->pPrivateState == nullptr) { - pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); + pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, + KNOB_SIMD_WIDTH * sizeof(float)); } return pState->pPrivateState; @@ -1662,13 +1655,10 @@ VOID* SwrGetPrivateContextState( /// @param hContext - Handle passed back from SwrCreateContext /// @param size - Size of allocation /// @param align - Alignment needed for allocation. -VOID* SwrAllocDrawContextMemory( - HANDLE hContext, - uint32_t size, - uint32_t align) +VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align) { - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); return pDC->pState->pArena->AllocAligned(size, align); } @@ -1677,12 +1667,10 @@ VOID* SwrAllocDrawContextMemory( /// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext /// @param enable - If true then counts are incremented. -void SwrEnableStatsFE( - HANDLE hContext, - bool enable) +void SwrEnableStatsFE(HANDLE hContext, bool enable) { - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->pState->state.enableStatsFE = enable; } @@ -1691,12 +1679,10 @@ void SwrEnableStatsFE( /// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext /// @param enable - If true then counts are incremented. 
-void SwrEnableStatsBE( - HANDLE hContext, - bool enable) +void SwrEnableStatsBE(HANDLE hContext, bool enable) { - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->pState->state.enableStatsBE = enable; } @@ -1704,11 +1690,10 @@ void SwrEnableStatsBE( ////////////////////////////////////////////////////////////////////////// /// @brief Mark end of frame - used for performance profiling /// @param hContext - Handle passed back from SwrCreateContext -void SWR_API SwrEndFrame( - HANDLE hContext) +void SWR_API SwrEndFrame(HANDLE hContext) { - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_CONTEXT* pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); (void)pDC; // var used RDTSC_ENDFRAME(); @@ -1733,55 +1718,55 @@ void SwrInit() InitRasterizerFunctions(); } -void SwrGetInterface(SWR_INTERFACE &out_funcs) -{ - out_funcs.pfnSwrCreateContext = SwrCreateContext; - out_funcs.pfnSwrDestroyContext = SwrDestroyContext; - out_funcs.pfnSwrBindApiThread = SwrBindApiThread; - out_funcs.pfnSwrSaveState = SwrSaveState; - out_funcs.pfnSwrRestoreState = SwrRestoreState; - out_funcs.pfnSwrSync = SwrSync; - out_funcs.pfnSwrStallBE = SwrStallBE; - out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle; - out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE; - out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers; - out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer; - out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc; - out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc; - out_funcs.pfnSwrSetSoState = SwrSetSoState; - out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers; - out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc; - out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState; - out_funcs.pfnSwrSetGsState = SwrSetGsState; - out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc; - out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc; - out_funcs.pfnSwrSetTsState = SwrSetTsState; - out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc; - out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc; - out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState; - out_funcs.pfnSwrSetBackendState = SwrSetBackendState; - out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState; - out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState; - out_funcs.pfnSwrSetBlendState = SwrSetBlendState; - out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc; - out_funcs.pfnSwrDraw = SwrDraw; - out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced; - out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed; - out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced; - out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles; - out_funcs.pfnSwrDiscardRect = SwrDiscardRect; - out_funcs.pfnSwrDispatch = SwrDispatch; - out_funcs.pfnSwrStoreTiles = SwrStoreTiles; - out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget; - out_funcs.pfnSwrSetRastState = SwrSetRastState; - out_funcs.pfnSwrSetViewports = SwrSetViewports; - out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects; +void SwrGetInterface(SWR_INTERFACE& out_funcs) +{ + out_funcs.pfnSwrCreateContext = SwrCreateContext; + out_funcs.pfnSwrDestroyContext = SwrDestroyContext; + out_funcs.pfnSwrBindApiThread = SwrBindApiThread; + out_funcs.pfnSwrSaveState = SwrSaveState; + out_funcs.pfnSwrRestoreState = SwrRestoreState; + out_funcs.pfnSwrSync = SwrSync; + out_funcs.pfnSwrStallBE = SwrStallBE; + out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle; + 
out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE; + out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers; + out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer; + out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc; + out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc; + out_funcs.pfnSwrSetSoState = SwrSetSoState; + out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers; + out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc; + out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState; + out_funcs.pfnSwrSetGsState = SwrSetGsState; + out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc; + out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc; + out_funcs.pfnSwrSetTsState = SwrSetTsState; + out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc; + out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc; + out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState; + out_funcs.pfnSwrSetBackendState = SwrSetBackendState; + out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState; + out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState; + out_funcs.pfnSwrSetBlendState = SwrSetBlendState; + out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc; + out_funcs.pfnSwrDraw = SwrDraw; + out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced; + out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed; + out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced; + out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles; + out_funcs.pfnSwrDiscardRect = SwrDiscardRect; + out_funcs.pfnSwrDispatch = SwrDispatch; + out_funcs.pfnSwrStoreTiles = SwrStoreTiles; + out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget; + out_funcs.pfnSwrSetRastState = SwrSetRastState; + out_funcs.pfnSwrSetViewports = SwrSetViewports; + out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects; out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState; out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory; - out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE; - out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE; - out_funcs.pfnSwrEndFrame = SwrEndFrame; - out_funcs.pfnSwrInit = SwrInit; + out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE; + out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE; + out_funcs.pfnSwrEndFrame = SwrEndFrame; + out_funcs.pfnSwrInit = SwrInit; out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile; out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface; out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear; diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index b171188c927..9cc5292e7b0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file api.h -* -* @brief API definitions -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file api.h + * + * @brief API definitions + * + ******************************************************************************/ #ifndef __SWR_API_H__ #define __SWR_API_H__ @@ -38,7 +38,7 @@ #include "common/formats.h" #include "core/state.h" -typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3); +typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3); ////////////////////////////////////////////////////////////////////////// /// @brief Rectangle structure @@ -47,20 +47,15 @@ struct SWR_RECT int32_t xmin; ///< inclusive int32_t ymin; ///< inclusive int32_t xmax; ///< exclusive - int32_t ymax; ///< exclusive + int32_t ymax; ///< exclusive - bool operator == (const SWR_RECT& rhs) + bool operator==(const SWR_RECT& rhs) { - return (this->ymin == rhs.ymin && - this->ymax == rhs.ymax && - this->xmin == rhs.xmin && - this->xmax == rhs.xmax); + return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin && + this->xmax == rhs.xmax); } - bool operator != (const SWR_RECT& rhs) - { - return !(*this == rhs); - } + bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); } SWR_RECT& Intersect(const SWR_RECT& other) { @@ -69,8 +64,7 @@ struct SWR_RECT this->xmax = std::min(this->xmax, other.xmax); this->ymax = std::min(this->ymax, other.ymax); - if (xmax - xmin < 0 || - ymax - ymin < 0) + if (xmax - xmin < 0 || ymax - ymin < 0) { // Zero area ymin = ymax = xmin = xmax = 0; @@ -78,10 +72,7 @@ struct SWR_RECT return *this; } - SWR_RECT& operator &= (const SWR_RECT& other) - { - return Intersect(other); - } + SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); } SWR_RECT& Union(const SWR_RECT& other) { @@ -93,10 +84,7 @@ struct SWR_RECT return *this; } - SWR_RECT& operator |= (const SWR_RECT& other) - { - return Union(other); - } + SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); } void Translate(int32_t x, int32_t y) { @@ -115,10 +103,14 @@ struct SWR_RECT /// @param x - destination x coordinate /// @param y - destination y coordinate /// @param pDstHotTile - pointer to the hot tile surface -typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile); +typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE hPrivateContext, + HANDLE hWorkerPrivateData, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + uint8_t* pDstHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Function signature for store hot tiles @@ -128,10 +120,14 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPriva /// @param x - destination x coordinate /// @param y - destination y coordinate /// @param pSrcHotTile - pointer to the hot tile surface -typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile); +typedef void(SWR_API* PFN_STORE_TILE)(HANDLE hPrivateContext, + HANDLE hWorkerPrivateData, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + uint8_t* pSrcHotTile); 
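The tile typedefs above are the driver's half of the hot-tile contract. As one hedged illustration, a store callback can resolve its surface object and forward to the SwrStoreHotTileToSurface helper declared near the end of this header (called directly here for brevity; a table-based build would go through pfnSwrStoreHotTileToSurface). MyDriverContext and LookupSurface() are hypothetical driver-side names, not part of the SWR API:

// Hedged sketch of a driver-side PFN_STORE_TILE implementation;
// MyDriverContext and LookupSurface() are hypothetical.
void SWR_API MyStoreTile(HANDLE                      hPrivateContext,
                         HANDLE                      hWorkerPrivateData,
                         SWR_FORMAT                  srcFormat,
                         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
                         uint32_t                    x,
                         uint32_t                    y,
                         uint32_t                    renderTargetArrayIndex,
                         uint8_t*                    pSrcHotTile)
{
    MyDriverContext*   pCtx = (MyDriverContext*)hPrivateContext;
    SWR_SURFACE_STATE* pDst = pCtx->LookupSurface(renderTargetIndex, renderTargetArrayIndex);

    // Deswizzle and write the hot tile back into the driver's surface.
    SwrStoreHotTileToSurface(hWorkerPrivateData, pDst, srcFormat, renderTargetIndex,
                             x, y, renderTargetArrayIndex, pSrcHotTile);
}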
////////////////////////////////////////////////////////////////////////// /// @brief Function signature for clearing from the hot tiles clear value @@ -141,9 +137,13 @@ typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPriv /// @param y - destination y coordinate /// @param renderTargetArrayIndex - render target array offset from arrayIndex /// @param pClearColor - pointer to the hot tile's clear value -typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData, - SWR_RENDERTARGET_ATTACHMENT rtIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, const float* pClearColor); +typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContext, + HANDLE hWorkerPrivateData, + SWR_RENDERTARGET_ATTACHMENT rtIndex, + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + const float* pClearColor); ////////////////////////////////////////////////////////////////////////// /// @brief Callback to allow driver to update their copy of streamout write offset. @@ -152,15 +152,15 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPriv /// @param hPrivateContext - handle to private data /// @param soBufferSlot - buffer slot for write offset /// @param soWriteOffset - update value for so write offset. -typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext, - uint32_t soBufferSlot, uint32_t soWriteOffset); +typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext, + uint32_t soBufferSlot, + uint32_t soWriteOffset); ////////////////////////////////////////////////////////////////////////// /// @brief Callback to allow driver to update their copy of stats. /// @param hPrivateContext - handle to private data /// @param pStats - pointer to draw stats -typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext, - const SWR_STATS* pStats); +typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats); ////////////////////////////////////////////////////////////////////////// /// @brief Callback to allow driver to update their copy of FE stats. @@ -169,8 +169,7 @@ typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext, /// to sum up the stats across all of the workers. 
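/// @note Since each worker reports independent counters, a typical driver
///       callback accumulates pStats into its running totals (e.g. with
///       atomics) rather than overwriting them.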
/// @param hPrivateContext - handle to private data /// @param pStats - pointer to draw stats -typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, - const SWR_STATS_FE* pStats); +typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats); ////////////////////////////////////////////////////////////////////////// /// BucketManager @@ -183,14 +182,14 @@ class BucketManager; ///////////////////////////////////////////////////////////////////////// struct SWR_THREADING_INFO { - uint32_t BASE_NUMA_NODE; - uint32_t BASE_CORE; - uint32_t BASE_THREAD; - uint32_t MAX_WORKER_THREADS; - uint32_t MAX_NUMA_NODES; - uint32_t MAX_CORES_PER_NUMA_NODE; - uint32_t MAX_THREADS_PER_CORE; - bool SINGLE_THREADED; + uint32_t BASE_NUMA_NODE; + uint32_t BASE_CORE; + uint32_t BASE_THREAD; + uint32_t MAX_WORKER_THREADS; + uint32_t MAX_NUMA_NODES; + uint32_t MAX_CORES_PER_NUMA_NODE; + uint32_t MAX_THREADS_PER_CORE; + bool SINGLE_THREADED; }; ////////////////////////////////////////////////////////////////////////// @@ -206,8 +205,8 @@ struct SWR_API_THREADING_INFO uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0, // binds thread used in SwrCreateContext to API Reserved // thread 0 - uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number. - // Independent of KNOB_MAX_THREADS_PER_CORE. + uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number. + // Independent of KNOB_MAX_THREADS_PER_CORE. }; ////////////////////////////////////////////////////////////////////////// @@ -217,13 +216,13 @@ struct SWR_API_THREADING_INFO ///////////////////////////////////////////////////////////////////////// struct SWR_WORKER_PRIVATE_STATE { - typedef void (SWR_API *PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum); + typedef void(SWR_API* PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum); - size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker - PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null - ///< worker data will be initialized to 0. - PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data. - ///< Can be null. + size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker + PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null + ///< worker data will be initialized to 0. + PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data. + ///< Can be null. }; ////////////////////////////////////////////////////////////////////////// @@ -233,198 +232,167 @@ struct SWR_CREATECONTEXT_INFO { // External functions (e.g. sampler) need per draw context state. // Use SwrGetPrivateContextState() to access private state. 
- size_t privateStateSize; + size_t privateStateSize; // Optional per-worker state, can be NULL for no worker-private data - SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState; + SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState; // Callback functions - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_CLEAR_TILE pfnClearTile; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; - PFN_UPDATE_STATS_FE pfnUpdateStatsFE; + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; + PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; + PFN_UPDATE_STATS pfnUpdateStats; + PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Pointer to rdtsc buckets mgr returned to the caller. // Only populated when KNOB_ENABLE_RDTSC is set - BucketManager* pBucketMgr; + BucketManager* pBucketMgr; // Output: size required memory passed to for SwrSaveState / SwrRestoreState - size_t contextSaveSize; + size_t contextSaveSize; // ArchRast event manager. - HANDLE hArEventManager; + HANDLE hArEventManager; // Input (optional): Threading info that overrides any set KNOB values. - SWR_THREADING_INFO* pThreadInfo; + SWR_THREADING_INFO* pThreadInfo; // Input (optional): Info for reserving API threads - SWR_API_THREADING_INFO* pApiThreadInfo; + SWR_API_THREADING_INFO* pApiThreadInfo; // Input: if set to non-zero value, overrides KNOB value for maximum // number of draws in flight - uint32_t MAX_DRAWS_IN_FLIGHT; + uint32_t MAX_DRAWS_IN_FLIGHT; }; ////////////////////////////////////////////////////////////////////////// /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. -SWR_FUNC(HANDLE, SwrCreateContext, - SWR_CREATECONTEXT_INFO* pCreateInfo); +SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo); ////////////////////////////////////////////////////////////////////////// /// @brief Destroys SWR Context. /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrDestroyContext, - HANDLE hContext); +SWR_FUNC(void, SwrDestroyContext, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Bind current thread to an API reserved HW thread /// @param hContext - Handle passed back from SwrCreateContext /// @param apiThreadId - index of reserved HW thread to bind to. 
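An aside before the remaining declarations: SWR_CREATECONTEXT_INFO above is the one struct a driver fills to obtain the HANDLE that every later entry point takes. A minimal creation sketch follows; the My* callbacks are the hypothetical driver hooks sketched earlier, sizeof(MyPerDrawState) is an assumed driver type, and the direct calls stand in for a table-based loader:

// Hedged context-creation sketch; zeroed fields take their defaults.
SWR_CREATECONTEXT_INFO info = {};
info.privateStateSize       = sizeof(MyPerDrawState); // hypothetical driver type
info.pfnLoadTile            = MyLoadTile;             // hypothetical callbacks,
info.pfnStoreTile           = MyStoreTile;            // as sketched earlier
info.pfnClearTile           = MyClearTile;
info.pfnUpdateSoWriteOffset = MyUpdateSoWriteOffset;
info.pfnUpdateStats         = MyUpdateStats;
info.pfnUpdateStatsFE       = MyUpdateStatsFE;
info.MAX_DRAWS_IN_FLIGHT    = 0; // 0 keeps the KNOB-controlled maximum

HANDLE hContext = SwrCreateContext(&info);
// ... set state, draw, dispatch ...
SwrDestroyContext(hContext);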
-SWR_FUNC(void, SwrBindApiThread, - HANDLE hContext, - uint32_t apiThreadId); +SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId); ////////////////////////////////////////////////////////////////////////// /// @brief Saves API state associated with hContext /// @param hContext - Handle passed back from SwrCreateContext /// @param pOutputStateBlock - Memory block to receive API state data /// @param memSize - Size of memory pointed to by pOutputStateBlock -SWR_FUNC(void, SwrSaveState, - HANDLE hContext, - void* pOutputStateBlock, - size_t memSize); +SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize); ////////////////////////////////////////////////////////////////////////// /// @brief Restores API state to hContext previously saved with SwrSaveState /// @param hContext - Handle passed back from SwrCreateContext /// @param pStateBlock - Memory block to read API state data from /// @param memSize - Size of memory pointed to by pStateBlock -SWR_FUNC(void, SwrRestoreState, - HANDLE hContext, - const void* pStateBlock, - size_t memSize); +SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize); ////////////////////////////////////////////////////////////////////////// /// @brief Sync cmd. Executes the callback func when all rendering up to this sync /// has been completed /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnFunc - pointer to callback function, -/// @param userData - user data to pass back -SWR_FUNC(void, SwrSync, - HANDLE hContext, - PFN_CALLBACK_FUNC pfnFunc, - uint64_t userData, - uint64_t userData2, - uint64_t userData3); +/// @param userData - user data to pass back +SWR_FUNC(void, + SwrSync, + HANDLE hContext, + PFN_CALLBACK_FUNC pfnFunc, + uint64_t userData, + uint64_t userData2, + uint64_t userData3); ////////////////////////////////////////////////////////////////////////// /// @brief Stall cmd. Stalls the backend until all previous work has been completed. /// Frontend work can continue to make progress /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrStallBE, - HANDLE hContext); +SWR_FUNC(void, SwrStallBE, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Blocks until all rendering has been completed. /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrWaitForIdle, - HANDLE hContext); +SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Blocks until all FE rendering has been completed. /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrWaitForIdleFE, - HANDLE hContext); +SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Set vertex buffer state. /// @param hContext - Handle passed back from SwrCreateContext /// @param numBuffers - Number of vertex buffer state descriptors. /// @param pVertexBuffers - Array of vertex buffer state descriptors. 
-SWR_FUNC(void, SwrSetVertexBuffers, - HANDLE hContext, - uint32_t numBuffers, - const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); +SWR_FUNC(void, + SwrSetVertexBuffers, + HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); ////////////////////////////////////////////////////////////////////////// /// @brief Set index buffer /// @param hContext - Handle passed back from SwrCreateContext /// @param pIndexBuffer - Index buffer. -SWR_FUNC(void, SwrSetIndexBuffer, - HANDLE hContext, - const SWR_INDEX_BUFFER_STATE* pIndexBuffer); +SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer); ////////////////////////////////////////////////////////////////////////// /// @brief Set fetch shader pointer. /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnFetchFunc - Pointer to shader. -SWR_FUNC(void, SwrSetFetchFunc, - HANDLE hContext, - PFN_FETCH_FUNC pfnFetchFunc); +SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc); ////////////////////////////////////////////////////////////////////////// /// @brief Set streamout shader pointer. /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnSoFunc - Pointer to shader. /// @param streamIndex - specifies stream -SWR_FUNC(void, SwrSetSoFunc, - HANDLE hContext, - PFN_SO_FUNC pfnSoFunc, - uint32_t streamIndex); +SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex); ////////////////////////////////////////////////////////////////////////// /// @brief Set streamout state /// @param hContext - Handle passed back from SwrCreateContext /// @param pSoState - Pointer to streamout state. -SWR_FUNC(void, SwrSetSoState, - HANDLE hContext, - SWR_STREAMOUT_STATE* pSoState); +SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState); ////////////////////////////////////////////////////////////////////////// /// @brief Set streamout buffer state /// @param hContext - Handle passed back from SwrCreateContext /// @param pSoBuffer - Pointer to streamout buffer. /// @param slot - Slot to bind SO buffer to. -SWR_FUNC(void, SwrSetSoBuffers, - HANDLE hContext, - SWR_STREAMOUT_BUFFER* pSoBuffer, - uint32_t slot); +SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot); ////////////////////////////////////////////////////////////////////////// /// @brief Set vertex shader pointer. /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnVertexFunc - Pointer to shader. -SWR_FUNC(void, SwrSetVertexFunc, - HANDLE hContext, - PFN_VERTEX_FUNC pfnVertexFunc); +SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc); ////////////////////////////////////////////////////////////////////////// /// @brief Set frontend state. /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state -SWR_FUNC(void, SwrSetFrontendState, - HANDLE hContext, - SWR_FRONTEND_STATE *pState); +SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set geometry shader state. 
/// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state -SWR_FUNC(void, SwrSetGsState, - HANDLE hContext, - SWR_GS_STATE *pState); +SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set geometry shader /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to geometry shader function -SWR_FUNC(void, SwrSetGsFunc, - HANDLE hContext, - PFN_GS_FUNC pfnGsFunc); +SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc); ////////////////////////////////////////////////////////////////////////// /// @brief Set compute shader @@ -434,88 +402,70 @@ SWR_FUNC(void, SwrSetGsFunc, /// @param totalSpillFillSize - size in bytes needed for spill/fill. /// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance /// @param numInstances - number of simd instances that are run per execution of the shader -SWR_FUNC(void, SwrSetCsFunc, - HANDLE hContext, - PFN_CS_FUNC pfnCsFunc, - uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize, - uint32_t scratchSpaceSizePerInstance, - uint32_t numInstances - ); +SWR_FUNC(void, + SwrSetCsFunc, + HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup, + uint32_t totalSpillFillSize, + uint32_t scratchSpaceSizePerInstance, + uint32_t numInstances); ////////////////////////////////////////////////////////////////////////// /// @brief Set tessellation state. /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state -SWR_FUNC(void, SwrSetTsState, - HANDLE hContext, - SWR_TS_STATE *pState); +SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set hull shader /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnFunc - Pointer to shader function -SWR_FUNC(void, SwrSetHsFunc, - HANDLE hContext, - PFN_HS_FUNC pfnFunc); +SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc); ////////////////////////////////////////////////////////////////////////// /// @brief Set domain shader /// @param hContext - Handle passed back from SwrCreateContext /// @param pfnFunc - Pointer to shader function -SWR_FUNC(void, SwrSetDsFunc, - HANDLE hContext, - PFN_DS_FUNC pfnFunc); +SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc); ////////////////////////////////////////////////////////////////////////// /// @brief Set depth stencil state /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetDepthStencilState, - HANDLE hContext, - SWR_DEPTH_STENCIL_STATE *pState); +SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set backend state /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetBackendState, - HANDLE hContext, - SWR_BACKEND_STATE *pState); +SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set depth bounds state /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state. 
-SWR_FUNC(void, SwrSetDepthBoundsState, - HANDLE hContext, - SWR_DEPTH_BOUNDS_STATE *pState); +SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set pixel shader state /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetPixelShaderState, - HANDLE hContext, - SWR_PS_STATE *pState); +SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set blend state /// @param hContext - Handle passed back from SwrCreateContext /// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetBlendState, - HANDLE hContext, - SWR_BLEND_STATE *pState); +SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState); ////////////////////////////////////////////////////////////////////////// /// @brief Set blend function /// @param hContext - Handle passed back from SwrCreateContext /// @param renderTarget - render target index /// @param pfnBlendFunc - function pointer -SWR_FUNC(void, SwrSetBlendFunc, - HANDLE hContext, - uint32_t renderTarget, - PFN_BLEND_JIT_FUNC pfnBlendFunc); +SWR_FUNC( + void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc); ////////////////////////////////////////////////////////////////////////// /// @brief SwrDraw @@ -523,11 +473,12 @@ SWR_FUNC(void, SwrSetBlendFunc, /// @param topology - Specifies topology for draw. /// @param startVertex - Specifies start vertex in vertex buffer for draw. /// @param primCount - Number of vertices. -SWR_FUNC(void, SwrDraw, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t startVertex, - uint32_t primCount); +SWR_FUNC(void, + SwrDraw, + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t startVertex, + uint32_t primCount); ////////////////////////////////////////////////////////////////////////// /// @brief SwrDrawInstanced @@ -536,14 +487,16 @@ SWR_FUNC(void, SwrDraw, /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. /// @param numInstances - How many instances to render. /// @param startVertex - Specifies start vertex for draw. (vertex data) -/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -SWR_FUNC(void, SwrDrawInstanced, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertsPerInstance, - uint32_t numInstances, - uint32_t startVertex, - uint32_t startInstance); +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +SWR_FUNC(void, + SwrDrawInstanced, + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertsPerInstance, + uint32_t numInstances, + uint32_t startVertex, + uint32_t startInstance); ////////////////////////////////////////////////////////////////////////// /// @brief DrawIndexed @@ -552,12 +505,13 @@ SWR_FUNC(void, SwrDrawInstanced, /// @param numIndices - Number of indices to read sequentially from index buffer. /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
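Its declaration follows below; first, a hedged usage sketch. The buffer-state structs are assumed to be filled in elsewhere, and TOP_TRIANGLE_LIST is assumed from the PRIMITIVE_TOPOLOGY enum in core/state.h:

// Hedged draw sketch: bind buffers, then issue an indexed draw.
SwrSetVertexBuffers(hContext, 1, &vbState); // vbState: SWR_VERTEX_BUFFER_STATE
SwrSetIndexBuffer(hContext, &ibState);      // ibState: SWR_INDEX_BUFFER_STATE

// 36 indices as a triangle list; index 0 resolves to vertex 'baseVertex'.
SwrDrawIndexed(hContext, TOP_TRIANGLE_LIST, 36, /*indexOffset*/ 0, /*baseVertex*/ 0);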
-SWR_FUNC(void, SwrDrawIndexed, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex); +SWR_FUNC(void, + SwrDrawIndexed, + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex); ////////////////////////////////////////////////////////////////////////// /// @brief SwrDrawIndexedInstanced @@ -567,26 +521,30 @@ SWR_FUNC(void, SwrDrawIndexed, /// @param numInstances - Number of instances to render. /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -SWR_FUNC(void, SwrDrawIndexedInstanced, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t numInstances, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t startInstance); +/// @param startInstance - Which instance to start sequentially fetching from in each buffer +/// (instanced data) +SWR_FUNC(void, + SwrDrawIndexedInstanced, + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t numInstances, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t startInstance); ////////////////////////////////////////////////////////////////////////// /// @brief SwrInvalidateTiles /// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to +/// invalidate. /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to /// be hottile size-aligned. -SWR_FUNC(void, SwrInvalidateTiles, - HANDLE hContext, - uint32_t attachmentMask, - const SWR_RECT& invalidateRect); +SWR_FUNC(void, + SwrInvalidateTiles, + HANDLE hContext, + uint32_t attachmentMask, + const SWR_RECT& invalidateRect); ////////////////////////////////////////////////////////////////////////// /// @brief SwrDiscardRect @@ -594,10 +552,7 @@ SWR_FUNC(void, SwrInvalidateTiles, /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be /// discarded. 
-SWR_FUNC(void, SwrDiscardRect,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    const SWR_RECT& rect);
+SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);

//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
@@ -605,27 +560,29 @@ SWR_FUNC(void, SwrDiscardRect,
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-SWR_FUNC(void, SwrDispatch,
-    HANDLE hContext,
-    uint32_t threadGroupCountX,
-    uint32_t threadGroupCountY,
-    uint32_t threadGroupCountZ);
-
+SWR_FUNC(void,
+         SwrDispatch,
+         HANDLE hContext,
+         uint32_t threadGroupCountX,
+         uint32_t threadGroupCountY,
+         uint32_t threadGroupCountZ);

enum SWR_TILE_STATE
{
-    SWR_TILE_INVALID = 0, // tile is in unitialized state and should be loaded with surface contents before rendering
-    SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents
-    SWR_TILE_RESOLVED = 3, // is in sync with surface it represents
+    SWR_TILE_INVALID  = 0, // tile is in uninitialized state and should be loaded with surface contents
+                           // before rendering
+    SWR_TILE_DIRTY    = 2, // tile contains newer data than surface it represents
+    SWR_TILE_RESOLVED = 3, // is in sync with surface it represents
};

-/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
-SWR_FUNC(void, SwrStoreTiles,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    SWR_TILE_STATE postStoreTileState,
-    const SWR_RECT& storeRect);
-
+/// @todo Add a good description for what attachments are and when and why you would use the
+/// different SWR_TILE_STATEs.
+SWR_FUNC(void,
+         SwrStoreTiles,
+         HANDLE hContext,
+         uint32_t attachmentMask,
+         SWR_TILE_STATE postStoreTileState,
+         const SWR_RECT& storeRect);

//////////////////////////////////////////////////////////////////////////
/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
@@ -636,22 +593,21 @@ SWR_FUNC(void, SwrStoreTiles,
/// @param z - depth value use for clearing depth buffer
/// @param stencil - stencil value used for clearing stencil buffer
/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-SWR_FUNC(void, SwrClearRenderTarget,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    uint32_t renderTargetArrayIndex,
-    const float clearColor[4],
-    float z,
-    uint8_t stencil,
-    const SWR_RECT& clearRect);
+SWR_FUNC(void,
+         SwrClearRenderTarget,
+         HANDLE hContext,
+         uint32_t attachmentMask,
+         uint32_t renderTargetArrayIndex,
+         const float clearColor[4],
+         float z,
+         uint8_t stencil,
+         const SWR_RECT& clearRect);

//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetRastState
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
-SWR_FUNC(void, SwrSetRastState,
-    HANDLE hContext,
-    const SWR_RASTSTATE *pRastState);
+SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);

//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetViewports
@@ -659,21 +615,20 @@ SWR_FUNC(void, SwrSetRastState,
/// @param numViewports - number of viewports passed in
/// @param pViewports - Specifies extents of viewport.
/// @param pMatrices - If not specified then SWR computes a default one.
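One more hedged sketch ties SwrStoreTiles and SwrEndFrame together at the end of a frame; the surface dimensions are assumed values:

// Flush the color hot tiles back to their surfaces, leave them in sync,
// then mark the frame boundary for profiling.
uint32_t width = 1920, height = 1080; // assumed surface size
SWR_RECT frameRect = {0, 0, (int32_t)width, (int32_t)height};
SwrStoreTiles(hContext,
              (1 << SWR_ATTACHMENT_COLOR0), // attachmentMask: one bit per attachment
              SWR_TILE_RESOLVED,            // hot tiles stay valid after the store
              frameRect);
SwrEndFrame(hContext);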
-SWR_FUNC(void, SwrSetViewports, - HANDLE hContext, - uint32_t numViewports, - const SWR_VIEWPORT* pViewports, - const SWR_VIEWPORT_MATRICES* pMatrices); +SWR_FUNC(void, + SwrSetViewports, + HANDLE hContext, + uint32_t numViewports, + const SWR_VIEWPORT* pViewports, + const SWR_VIEWPORT_MATRICES* pMatrices); ////////////////////////////////////////////////////////////////////////// /// @brief SwrSetScissorRects /// @param hContext - Handle passed back from SwrCreateContext /// @param numScissors - number of scissors passed in /// @param pScissors - array of scissors -SWR_FUNC(void, SwrSetScissorRects, - HANDLE hContext, - uint32_t numScissors, - const SWR_RECT* pScissors); +SWR_FUNC( + void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors); ////////////////////////////////////////////////////////////////////////// /// @brief Returns a pointer to the private context state for the current @@ -683,8 +638,7 @@ SWR_FUNC(void, SwrSetScissorRects, /// @note Client needs to resend private state prior to each draw call. /// Also, SWR is responsible for the private state memory. /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void*, SwrGetPrivateContextState, - HANDLE hContext); +SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Clients can use this to allocate memory for draw/dispatch @@ -694,32 +648,24 @@ SWR_FUNC(void*, SwrGetPrivateContextState, /// @param hContext - Handle passed back from SwrCreateContext /// @param size - Size of allocation /// @param align - Alignment needed for allocation. -SWR_FUNC(void*, SwrAllocDrawContextMemory, - HANDLE hContext, - uint32_t size, - uint32_t align); +SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align); ////////////////////////////////////////////////////////////////////////// /// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext /// @param enable - If true then counts are incremented. -SWR_FUNC(void, SwrEnableStatsFE, - HANDLE hContext, - bool enable); +SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable); ////////////////////////////////////////////////////////////////////////// /// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext /// @param enable - If true then counts are incremented. -SWR_FUNC(void, SwrEnableStatsBE, - HANDLE hContext, - bool enable); +SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable); ////////////////////////////////////////////////////////////////////////// /// @brief Mark end of frame - used for performance profiling /// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrEndFrame, - HANDLE hContext); +SWR_FUNC(void, SwrEndFrame, HANDLE hContext); ////////////////////////////////////////////////////////////////////////// /// @brief Initialize swr backend and memory internal tables @@ -733,13 +679,16 @@ SWR_FUNC(void, SwrInit); /// @param renderTargetIndex - Index to src render target /// @param x, y - Coordinates to raster tile. 
/// @param pDstHotTile - Pointer to Hot Tile -SWR_FUNC(void, SwrLoadHotTile, - HANDLE hWorkerPrivateData, - const SWR_SURFACE_STATE *pSrcSurface, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pDstHotTile); +SWR_FUNC(void, + SwrLoadHotTile, + HANDLE hWorkerPrivateData, + const SWR_SURFACE_STATE* pSrcSurface, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + uint8_t* pDstHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Deswizzles and stores a full hottile to a render surface @@ -748,13 +697,16 @@ SWR_FUNC(void, SwrLoadHotTile, /// @param renderTargetIndex - Index to destination render target /// @param x, y - Coordinates to raster tile. /// @param pSrcHotTile - Pointer to Hot Tile -SWR_FUNC(void, SwrStoreHotTileToSurface, - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE *pDstSurface, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pSrcHotTile); +SWR_FUNC(void, + SwrStoreHotTileToSurface, + HANDLE hWorkerPrivateData, + SWR_SURFACE_STATE* pDstSurface, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + uint8_t* pSrcHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Writes clear color to every pixel of a render surface @@ -762,72 +714,73 @@ SWR_FUNC(void, SwrStoreHotTileToSurface, /// @param renderTargetIndex - Index to destination render target /// @param x, y - Coordinates to raster tile. /// @param pClearColor - Pointer to clear color -SWR_FUNC(void, SwrStoreHotTileClear, - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE *pDstSurface, +SWR_FUNC(void, + SwrStoreHotTileClear, + HANDLE hWorkerPrivateData, + SWR_SURFACE_STATE* pDstSurface, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - const float* pClearColor); + uint32_t x, + uint32_t y, + uint32_t renderTargetArrayIndex, + const float* pClearColor); struct SWR_INTERFACE { - PFNSwrCreateContext pfnSwrCreateContext; - PFNSwrDestroyContext pfnSwrDestroyContext; - PFNSwrBindApiThread pfnSwrBindApiThread; - PFNSwrSaveState pfnSwrSaveState; - PFNSwrRestoreState pfnSwrRestoreState; - PFNSwrSync pfnSwrSync; - PFNSwrStallBE pfnSwrStallBE; - PFNSwrWaitForIdle pfnSwrWaitForIdle; - PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE; - PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers; - PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer; - PFNSwrSetFetchFunc pfnSwrSetFetchFunc; - PFNSwrSetSoFunc pfnSwrSetSoFunc; - PFNSwrSetSoState pfnSwrSetSoState; - PFNSwrSetSoBuffers pfnSwrSetSoBuffers; - PFNSwrSetVertexFunc pfnSwrSetVertexFunc; - PFNSwrSetFrontendState pfnSwrSetFrontendState; - PFNSwrSetGsState pfnSwrSetGsState; - PFNSwrSetGsFunc pfnSwrSetGsFunc; - PFNSwrSetCsFunc pfnSwrSetCsFunc; - PFNSwrSetTsState pfnSwrSetTsState; - PFNSwrSetHsFunc pfnSwrSetHsFunc; - PFNSwrSetDsFunc pfnSwrSetDsFunc; - PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState; - PFNSwrSetBackendState pfnSwrSetBackendState; - PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState; - PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState; - PFNSwrSetBlendState pfnSwrSetBlendState; - PFNSwrSetBlendFunc pfnSwrSetBlendFunc; - PFNSwrDraw pfnSwrDraw; - PFNSwrDrawInstanced pfnSwrDrawInstanced; - 
PFNSwrDrawIndexed pfnSwrDrawIndexed; - PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced; - PFNSwrInvalidateTiles pfnSwrInvalidateTiles; - PFNSwrDiscardRect pfnSwrDiscardRect; - PFNSwrDispatch pfnSwrDispatch; - PFNSwrStoreTiles pfnSwrStoreTiles; - PFNSwrClearRenderTarget pfnSwrClearRenderTarget; - PFNSwrSetRastState pfnSwrSetRastState; - PFNSwrSetViewports pfnSwrSetViewports; - PFNSwrSetScissorRects pfnSwrSetScissorRects; + PFNSwrCreateContext pfnSwrCreateContext; + PFNSwrDestroyContext pfnSwrDestroyContext; + PFNSwrBindApiThread pfnSwrBindApiThread; + PFNSwrSaveState pfnSwrSaveState; + PFNSwrRestoreState pfnSwrRestoreState; + PFNSwrSync pfnSwrSync; + PFNSwrStallBE pfnSwrStallBE; + PFNSwrWaitForIdle pfnSwrWaitForIdle; + PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE; + PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers; + PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer; + PFNSwrSetFetchFunc pfnSwrSetFetchFunc; + PFNSwrSetSoFunc pfnSwrSetSoFunc; + PFNSwrSetSoState pfnSwrSetSoState; + PFNSwrSetSoBuffers pfnSwrSetSoBuffers; + PFNSwrSetVertexFunc pfnSwrSetVertexFunc; + PFNSwrSetFrontendState pfnSwrSetFrontendState; + PFNSwrSetGsState pfnSwrSetGsState; + PFNSwrSetGsFunc pfnSwrSetGsFunc; + PFNSwrSetCsFunc pfnSwrSetCsFunc; + PFNSwrSetTsState pfnSwrSetTsState; + PFNSwrSetHsFunc pfnSwrSetHsFunc; + PFNSwrSetDsFunc pfnSwrSetDsFunc; + PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState; + PFNSwrSetBackendState pfnSwrSetBackendState; + PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState; + PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState; + PFNSwrSetBlendState pfnSwrSetBlendState; + PFNSwrSetBlendFunc pfnSwrSetBlendFunc; + PFNSwrDraw pfnSwrDraw; + PFNSwrDrawInstanced pfnSwrDrawInstanced; + PFNSwrDrawIndexed pfnSwrDrawIndexed; + PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced; + PFNSwrInvalidateTiles pfnSwrInvalidateTiles; + PFNSwrDiscardRect pfnSwrDiscardRect; + PFNSwrDispatch pfnSwrDispatch; + PFNSwrStoreTiles pfnSwrStoreTiles; + PFNSwrClearRenderTarget pfnSwrClearRenderTarget; + PFNSwrSetRastState pfnSwrSetRastState; + PFNSwrSetViewports pfnSwrSetViewports; + PFNSwrSetScissorRects pfnSwrSetScissorRects; PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState; PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory; - PFNSwrEnableStatsFE pfnSwrEnableStatsFE; - PFNSwrEnableStatsBE pfnSwrEnableStatsBE; - PFNSwrEndFrame pfnSwrEndFrame; - PFNSwrInit pfnSwrInit; - PFNSwrLoadHotTile pfnSwrLoadHotTile; + PFNSwrEnableStatsFE pfnSwrEnableStatsFE; + PFNSwrEnableStatsBE pfnSwrEnableStatsBE; + PFNSwrEndFrame pfnSwrEndFrame; + PFNSwrInit pfnSwrInit; + PFNSwrLoadHotTile pfnSwrLoadHotTile; PFNSwrStoreHotTileToSurface pfnSwrStoreHotTileToSurface; - PFNSwrStoreHotTileClear pfnSwrStoreHotTileClear; + PFNSwrStoreHotTileClear pfnSwrStoreHotTileClear; }; extern "C" { -typedef void (SWR_API * PFNSwrGetInterface)(SWR_INTERFACE &out_funcs); -SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE &out_funcs); +typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs); +SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs); } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 1db09726cb7..a3cfdb47818 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -1,35 +1,35 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
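A note on the extern "C" export that closed api.h just above, before the arena.h diff: a client resolves the single SwrGetInterface symbol and reaches every other entry point through the filled-in table. A hedged loader sketch; the DSO name is only illustrative of Mesa's per-ISA swr builds, and error handling is elided:

#include <dlfcn.h>

// Hedged loader sketch: one dlsym for SwrGetInterface, everything else
// comes from the SWR_INTERFACE table it fills in.
void* hLib = dlopen("libswrAVX2.so", RTLD_NOW); // ISA-specific DSO name assumed
PFNSwrGetInterface pfnSwrGetInterface =
    (PFNSwrGetInterface)dlsym(hLib, "SwrGetInterface");

SWR_INTERFACE swr;
pfnSwrGetInterface(swr);
HANDLE hContext = swr.pfnSwrCreateContext(&info); // info as in the earlier sketch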
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.h
-*
-* @brief Arena memory manager
-*        The arena is convenient and fast for managing allocations for any of
-*        our allocations that are associated with operations and can all be freed
-*        once when their operation has completed. Allocations are cheap since
-*        most of the time its simply an increment of an offset. Also, no need to
-*        free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file arena.h
+ *
+ * @brief Arena memory manager
+ *        The arena is convenient and fast for managing allocations for any of
+ *        our allocations that are associated with operations and can all be freed
+ *        once when their operation has completed. Allocations are cheap since
+ *        most of the time it's simply an increment of an offset. Also, no need to
+ *        free individual allocations. All of the arena memory can be freed at once.
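The contract this header comment describes, as a hedged usage sketch (CachingAllocator, CachingArena and the member functions used here are the ones defined below in this file):

// Allocations are offset bumps; one Reset() releases the whole batch and
// lets the caching allocator recycle the underlying blocks.
CachingAllocator allocator;        // shareable across arenas
CachingArena     arena(allocator); // TArena<CachingAllocator>, see below

void* pDrawData = arena.AllocAligned(256, 64); // bump + align, no per-alloc free
void* pScratch  = arena.Alloc(128);            // same path with align = 1
// ... the owning operation (e.g. one draw) runs to completion ...
arena.Reset(false); // releases everything allocated above in one call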
+ * + ******************************************************************************/ #pragma once #include <mutex> @@ -42,10 +42,9 @@ static const size_t ARENA_BLOCK_ALIGN = 64; struct ArenaBlock { size_t blockSize = 0; - ArenaBlock* pNext = nullptr; + ArenaBlock* pNext = nullptr; }; -static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, - "Increase BLOCK_ALIGN size"); +static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size"); class DefaultAllocator { @@ -55,7 +54,7 @@ public: SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock)); ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock(); - p->blockSize = size; + p->blockSize = size; return p; } @@ -70,7 +69,7 @@ public: }; // Caching Allocator for Arena -template<uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12> +template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12> struct CachingAllocatorT : DefaultAllocator { ArenaBlock* AllocateAligned(size_t size, size_t align) @@ -83,8 +82,8 @@ struct CachingAllocatorT : DefaultAllocator { // search cached blocks std::lock_guard<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket]; - ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align); + ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket]; + ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align); if (pBlock) { @@ -97,7 +96,7 @@ struct CachingAllocatorT : DefaultAllocator else { pPrevBlock = &m_oldCachedBlocks[bucket]; - pBlock = SearchBlocks(pPrevBlock, size, align); + pBlock = SearchBlocks(pPrevBlock, size, align); if (pBlock) { @@ -113,7 +112,7 @@ struct CachingAllocatorT : DefaultAllocator { SWR_ASSUME_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock); pPrevBlock->pNext = pBlock->pNext; - pBlock->pNext = nullptr; + pBlock->pNext = nullptr; return pBlock; } @@ -150,7 +149,10 @@ struct CachingAllocatorT : DefaultAllocator void FreeOldBlocks() { - if (!m_cachedSize) { return; } + if (!m_cachedSize) + { + return; + } std::lock_guard<std::mutex> l(m_mutex); bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE); @@ -169,7 +171,7 @@ struct CachingAllocatorT : DefaultAllocator pBlock = pNext; } m_oldCachedBlocks[i].pNext = nullptr; - m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; + m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i]) @@ -179,8 +181,8 @@ struct CachingAllocatorT : DefaultAllocator // We know that all blocks are the same size. // Just move the list over. 
m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext; - m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; - m_cachedBlocks[i].pNext = nullptr; + m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; + m_cachedBlocks[i].pNext = nullptr; if (m_pOldLastCachedBlocks[i]->pNext) { m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i]; @@ -195,13 +197,13 @@ struct CachingAllocatorT : DefaultAllocator while (pBlock) { ArenaBlock* pNext = pBlock->pNext; - pBlock->pNext = nullptr; + pBlock->pNext = nullptr; m_cachedSize -= pBlock->blockSize; InsertCachedBlock<true>(i, pBlock); pBlock = pNext; } - m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; m_cachedBlocks[i].pNext = nullptr; } } @@ -215,7 +217,7 @@ struct CachingAllocatorT : DefaultAllocator { for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) { - m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } } @@ -260,7 +262,8 @@ private: { SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS); - ArenaBlock* pPrevBlock = OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId]; + ArenaBlock* pPrevBlock = + OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId]; ArenaBlock* pBlock = pPrevBlock->pNext; while (pBlock) @@ -271,13 +274,13 @@ private: break; } pPrevBlock = pBlock; - pBlock = pBlock->pNext; + pBlock = pBlock->pNext; } // Insert into list SWR_ASSUME_ASSERT(pPrevBlock); pPrevBlock->pNext = pNewBlock; - pNewBlock->pNext = pBlock; + pNewBlock->pNext = pBlock; if (OldBlockT) { @@ -301,9 +304,9 @@ private: static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align) { - ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pBlock = pPrevBlock->pNext; ArenaBlock* pPotentialBlock = nullptr; - ArenaBlock* pPotentialPrev = nullptr; + ArenaBlock* pPotentialPrev = nullptr; while (pBlock) { @@ -320,26 +323,26 @@ private: // We could use this as it is larger than we wanted, but // continue to search for a better match pPotentialBlock = pBlock; - pPotentialPrev = pPrevBlock; + pPotentialPrev = pPrevBlock; } } else { // Blocks are sorted by size (biggest first) - // So, if we get here, there are no blocks + // So, if we get here, there are no blocks // large enough, fall through to allocation. pBlock = nullptr; break; } pPrevBlock = pBlock; - pBlock = pBlock->pNext; + pBlock = pBlock->pNext; } if (!pBlock) { // Couldn't find an exact match, use next biggest size - pBlock = pPotentialBlock; + pBlock = pPotentialBlock; pPrevBlock = pPotentialPrev; } @@ -347,35 +350,32 @@ private: } // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... 
- static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; - static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; - static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE); + static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; + static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE); - ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS]; - std::mutex m_mutex; + ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS]; + std::mutex m_mutex; - size_t m_totalAllocated = 0; + size_t m_totalAllocated = 0; - size_t m_cachedSize = 0; - size_t m_oldCachedSize = 0; + size_t m_cachedSize = 0; + size_t m_oldCachedSize = 0; }; typedef CachingAllocatorT<> CachingAllocator; -template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)> +template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)> class TArena { public: - TArena(T& in_allocator) : m_allocator(in_allocator) {} - TArena() : m_allocator(m_defAllocator) {} - ~TArena() - { - Reset(true); - } + TArena(T& in_allocator) : m_allocator(in_allocator) {} + TArena() : m_allocator(m_defAllocator) {} + ~TArena() { Reset(true); } - void* AllocAligned(size_t size, size_t align) + void* AllocAligned(size_t size, size_t align) { if (0 == size) { @@ -387,12 +387,12 @@ public: if (m_pCurBlock) { ArenaBlock* pCurBlock = m_pCurBlock; - size_t offset = AlignUp(m_offset, align); + size_t offset = AlignUp(m_offset, align); if ((offset + size) <= pCurBlock->blockSize) { void* pMem = PtrAdd(pCurBlock, offset); - m_offset = offset + size; + m_offset = offset + size; return pMem; } @@ -401,17 +401,18 @@ public: } static const size_t ArenaBlockSize = BlockSizeT; - size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize); + size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize); // Add in one BLOCK_ALIGN unit to store ArenaBlock in. blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN); - ArenaBlock* pNewBlock = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. + ArenaBlock* pNewBlock = m_allocator.AllocateAligned( + blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. 
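        // Worked instance of the sizing math above (request values assumed):
        // AllocAligned(1000, 16) on a fresh arena with BlockSizeT = 128 KB gives
        // blockSize = max(1000 + 64, 131072) = 131072, already 64-aligned; the
        // first ARENA_BLOCK_ALIGN bytes hold the ArenaBlock header, so m_offset
        // restarts at 64 and the recursive call below returns the block base + 64.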
SWR_ASSERT(pNewBlock != nullptr); if (pNewBlock != nullptr) { - m_offset = ARENA_BLOCK_ALIGN; + m_offset = ARENA_BLOCK_ALIGN; pNewBlock->pNext = m_pCurBlock; m_pCurBlock = pNewBlock; @@ -420,10 +421,7 @@ public: return AllocAligned(size, align); } - void* Alloc(size_t size) - { - return AllocAligned(size, 1); - } + void* Alloc(size_t size) { return AllocAligned(size, 1); } void* AllocAlignedSync(size_t size, size_t align) { @@ -453,12 +451,12 @@ public: if (m_pCurBlock) { - ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; - m_pCurBlock->pNext = nullptr; + ArenaBlock* pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; while (pUsedBlocks) { ArenaBlock* pBlock = pUsedBlocks; - pUsedBlocks = pBlock->pNext; + pUsedBlocks = pBlock->pNext; m_allocator.Free(pBlock); } @@ -473,20 +471,20 @@ public: bool IsEmpty() { - return (m_pCurBlock == nullptr) || (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr); + return (m_pCurBlock == nullptr) || + (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr); } private: - - ArenaBlock* m_pCurBlock = nullptr; - size_t m_offset = ARENA_BLOCK_ALIGN; + ArenaBlock* m_pCurBlock = nullptr; + size_t m_offset = ARENA_BLOCK_ALIGN; /// @note Mutex is only used by sync allocation functions. - std::mutex m_mutex; + std::mutex m_mutex; - DefaultAllocator m_defAllocator; - T& m_allocator; + DefaultAllocator m_defAllocator; + T& m_allocator; }; -using StdArena = TArena<DefaultAllocator>; -using CachingArena = TArena<CachingAllocator>; +using StdArena = TArena<DefaultAllocator>; +using CachingArena = TArena<CachingAllocator>; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 5ac9ceb165e..8f8dbcf7884 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file backend.cpp -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.cpp + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. + * + ******************************************************************************/ #include <smmintrin.h> @@ -44,9 +44,13 @@ /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) +void ProcessComputeBE(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t threadGroupId, + void*& pSpillFillBuffer, + void*& pScratchSpace) { - SWR_CONTEXT *pContext = pDC->pContext; + SWR_CONTEXT* pContext = pDC->pContext; RDTSC_BEGIN(BEDispatch, pDC->drawId); @@ -59,8 +63,9 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup { pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); } - - size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; + + size_t scratchSpaceSize = + pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; if (scratchSpaceSize && pScratchSpace == nullptr) { pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES); @@ -68,17 +73,19 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup const API_STATE& state = GetApiState(pDC); - SWR_CS_CONTEXT csContext{ 0 }; - csContext.tileCounter = threadGroupId; - csContext.dispatchDims[0] = pTaskData->threadGroupCountX; - csContext.dispatchDims[1] = pTaskData->threadGroupCountY; - csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; - csContext.pTGSM = pContext->ppScratch[workerId]; - csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; - csContext.pScratchSpace = (uint8_t*)pScratchSpace; + SWR_CS_CONTEXT csContext{0}; + csContext.tileCounter = threadGroupId; + csContext.dispatchDims[0] = pTaskData->threadGroupCountX; + csContext.dispatchDims[1] = pTaskData->threadGroupCountY; + csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; + csContext.pTGSM = pContext->ppScratch[workerId]; + csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; + csContext.pScratchSpace = (uint8_t*)pScratchSpace; csContext.scratchSpacePerSimd = 
pDC->pState->state.scratchSpaceSize; - state.pfnCsFunc(GetPrivateState(pDC), pContext->threadPool.pThreadData[workerId].pWorkerPrivateData, &csContext); + state.pfnCsFunc(GetPrivateState(pDC), + pContext->threadPool.pThreadData[workerId].pWorkerPrivateData, + &csContext); UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); AR_EVENT(CSStats(csContext.stats.numInstExecuted)); @@ -91,23 +98,26 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData) { // Dummy function } -void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData) { uint32_t x, y; MacroTileMgr::getTileIndices(macroTile, x, y); SWR_ASSERT(x == 0 && y == 0); } -void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, - SWR_RENDERTARGET_ATTACHMENT attachment) +void ProcessStoreTileBE(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroTile, + STORE_TILES_DESC* pDesc, + SWR_RENDERTARGET_ATTACHMENT attachment) { - SWR_CONTEXT *pContext = pDC->pContext; - HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; + SWR_CONTEXT* pContext = pDC->pContext; + HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; RDTSC_BEGIN(BEStoreTiles, pDC->drawId); @@ -121,17 +131,27 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile case SWR_ATTACHMENT_COLOR4: case SWR_ATTACHMENT_COLOR5: case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; - default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_COLOR7: + srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; + break; + case SWR_ATTACHMENT_DEPTH: + srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; + break; + case SWR_ATTACHMENT_STENCIL: + srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; + break; + default: + SWR_INVALID("Unknown attachment: %d", attachment); + srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; + break; } uint32_t x, y; MacroTileMgr::getTileIndices(macroTile, x, y); // Only need to store the hottile if it's been rendered to... - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); + HOTTILE* pHotTile = + pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); if (pHotTile) { // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. 
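Reviewer aside: the ProcessStoreTileBE hunk below is whitespace-only, but the hot-tile state handling it rewraps is easy to lose in the reflow. In simplified scalar form it does roughly the following. This is an illustrative C++ sketch only: the HOTTILE_STATE names match the source, while StoreHotTile, the bare HOTTILE struct, and the commented-out pfnClearTiles/pfnStoreTile calls are stand-ins for the real table-driven helpers, and in the source the CLEAR-to-DIRTY transition happens inside the clear helper rather than inline.

    // Enumerator order is illustrative; only these three states matter here.
    enum HOTTILE_STATE { HOTTILE_CLEAR, HOTTILE_DIRTY, HOTTILE_RESOLVED };

    struct HOTTILE { HOTTILE_STATE state; };

    // Sketch of the per-attachment store path for one macrotile.
    void StoreHotTile(HOTTILE& ht, HOTTILE_STATE postStoreState)
    {
        if (ht.state == HOTTILE_CLEAR)
        {
            // A fast clear is pending and nothing was rendered over it:
            // materialize the clear color first, then treat the tile as
            // dirty so the store below writes the clear color.
            // pfnClearTiles(...);
            ht.state = HOTTILE_DIRTY;
        }
        if (ht.state == HOTTILE_DIRTY || postStoreState == HOTTILE_DIRTY)
        {
            // pfnStoreTile(...);  // write the tile back to the render target
        }
        if (ht.state == HOTTILE_DIRTY || ht.state == HOTTILE_RESOLVED)
        {
            // Don't demote a RESOLVED tile when the caller requests DIRTY.
            if (!(postStoreState == HOTTILE_DIRTY && ht.state == HOTTILE_RESOLVED))
            {
                ht.state = postStoreState;
            }
        }
    }
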
@@ -140,22 +160,35 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat]; SWR_ASSERT(pfnClearTiles != nullptr); - pfnClearTiles(pDC, hWorkerPrivateData, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); + pfnClearTiles(pDC, + hWorkerPrivateData, + attachment, + macroTile, + pHotTile->renderTargetArrayIndex, + pHotTile->clearData, + pDesc->rect); } - if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) + if (pHotTile->state == HOTTILE_DIRTY || + pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) { int32_t destX = KNOB_MACROTILE_X_DIM * x; int32_t destY = KNOB_MACROTILE_Y_DIM * y; - pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, srcFormat, - attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pContext->pfnStoreTile(GetPrivateState(pDC), + hWorkerPrivateData, + srcFormat, + attachment, + destX, + destY, + pHotTile->renderTargetArrayIndex, + pHotTile->pBuffer); } - if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) { - if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED)) + if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && + pHotTile->state == HOTTILE_RESOLVED)) { pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; } @@ -164,12 +197,12 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile RDTSC_END(BEStoreTiles, 1); } -void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) { - STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; + STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData; - unsigned long rt = 0; - uint32_t mask = pDesc->attachmentMask; + unsigned long rt = 0; + uint32_t mask = pDesc->attachmentMask; while (_BitScanForward(&rt, mask)) { mask &= ~(1 << rt); @@ -177,10 +210,13 @@ void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil } } -void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroTile, + void* pData) { - DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; - SWR_CONTEXT *pContext = pDC->pContext; + DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pData; + SWR_CONTEXT* pContext = pDC->pContext; const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); @@ -188,8 +224,13 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3 { if (pDesc->attachmentMask & (1 << i)) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( - pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); + HOTTILE* pHotTile = + pContext->pHotTileMgr->GetHotTileNoLoad(pContext, + pDC, + macroTile, + (SWR_RENDERTARGET_ATTACHMENT)i, + pDesc->createNewTiles, + numSamples); if (pHotTile) { pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; @@ -198,14 +239,19 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3 } } -template<uint32_t sampleCountT> -void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, 
RenderOutputBuffers &renderBuffers) +template <uint32_t sampleCountT> +void BackendNullPS(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t x, + uint32_t y, + SWR_TRIANGLE_DESC& work, + RenderOutputBuffers& renderBuffers) { RDTSC_BEGIN(BENullBackend, pDC->drawId); ///@todo: handle center multisample pattern RDTSC_BEGIN(BESetup, pDC->drawId); - const API_STATE &state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); @@ -220,7 +266,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); - const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); + const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { @@ -231,8 +277,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { // iterate over active samples - unsigned long sample = 0; - uint32_t sampleMask = state.blendState.sampleMask; + unsigned long sample = 0; + uint32_t sampleMask = state.blendState.sampleMask; while (_BitScanForward(&sample, sampleMask)) { sampleMask &= ~(1 << sample); @@ -242,14 +288,16 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, if (coverageMask) { // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, + "Unsupported depth hot tile format"); - const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); + const simdscalar z = + _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; @@ -266,7 +314,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = vplaneps(coeffs.vZa, + coeffs.vZb, + coeffs.vZc, + psContext.vI.sample, + psContext.vJ.sample); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_END(BEBarycentric, 0); @@ -274,21 +326,39 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { - coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); + coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, + work.pUserClipBuffer, + psContext.vI.sample, + psContext.vJ.sample); } - simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); + simdscalar vCoverageMask = 
_simd_vmask_ps(coverageMask); simdscalar stencilPassMask = vCoverageMask; RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId); - simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + simdscalar depthPassMask = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + psContext.vZ, + pDepthSample, + vCoverageMask, + pStencilSample, + &stencilPassMask); + AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), + _simd_movemask_ps(stencilPassMask), + _simd_movemask_ps(vCoverageMask))); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthSample, + depthPassMask, + vCoverageMask, + pStencilSample, + stencilPassMask); RDTSC_END(BEEarlyDepthTest, 0); - uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); } @@ -299,7 +369,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, } pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBuffer += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx); } @@ -310,34 +381,30 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_END(BENullBackend, 0); } -PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {}; +PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {}; PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; -PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2] // canEarlyZ - = {}; -PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [2] // isCenterPattern - [SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2] // forcedSampleCount - [2] // canEarlyZ - = {}; -PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [SWR_INPUT_COVERAGE_COUNT] +PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid + [2] // canEarlyZ + = {}; +PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern + [SWR_INPUT_COVERAGE_COUNT][2] // centroid + [2] // forcedSampleCount + [2] // canEarlyZ + = {}; +PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // canEarlyZ - = {}; + = {}; void InitBackendFuncTables() -{ +{ InitBackendPixelRate(); InitBackendSingleFuncTable(gBackendSingleSample); InitBackendSampleFuncTable(gBackendSampleRateTable); - gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; - gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ; - gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ; - gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ; - 
gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ; + gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>; + gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>; + gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>; + gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>; + gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>; } diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 7a842fe0e20..79d9007bee6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file backend.h -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.h + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. 
+ * + ******************************************************************************/ #pragma once #include "common/os.h" @@ -34,29 +34,37 @@ #include "depthstencil.h" #include "rdtsc_core.h" -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace); -void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessComputeBE(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t threadGroupId, + void*& pSpillFillBuffer, + void*& pScratchSpace); +void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData); +void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData); +void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroTile, + void* pData); +void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData); -typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, HANDLE hWorkerData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect); +typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*, + HANDLE hWorkerData, + SWR_RENDERTARGET_ATTACHMENT rt, + uint32_t, + uint32_t, + DWORD[4], + const SWR_RECT& rect); -extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS]; +extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS]; extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; -extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2]; // canEarlyZ -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [2] // isCenterPattern - [SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2] // forcedSampleCount - [2] // canEarlyZ - ; +extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid + [2]; // canEarlyZ +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern + [SWR_INPUT_COVERAGE_COUNT][2] // centroid + [2] // forcedSampleCount + [2] // canEarlyZ + ; extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2]; // canEarlyZ - + [SWR_INPUT_COVERAGE_COUNT][2] // centroid + [2]; // canEarlyZ diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp index af031f9f9d7..0b14ca09f4c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file backend.cpp -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.cpp + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. 
+ * + ******************************************************************************/ #include <smmintrin.h> @@ -37,17 +37,17 @@ #include <algorithm> -template<SWR_FORMAT format> -void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) +template <SWR_FORMAT format> +void ClearRasterTile(uint8_t* pTileBuffer, simdvector& value) { - auto lambda = [&](int32_t comp) - { + auto lambda = [&](int32_t comp) { FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); }; - const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); + const uint32_t numIter = + (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); for (uint32_t i = 0; i < numIter; ++i) { @@ -56,17 +56,17 @@ void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) } #if USE_8x2_TILE_BACKEND -template<SWR_FORMAT format> -void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value) +template <SWR_FORMAT format> +void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value) { - auto lambda = [&](int32_t comp) - { + auto lambda = [&](int32_t comp) { FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); }; - const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); + const uint32_t numIter = + (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); for (uint32_t i = 0; i < numIter; ++i) { @@ -75,8 +75,14 @@ void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value) } #endif -template<SWR_FORMAT format> -INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect) +template <SWR_FORMAT format> +INLINE void ClearMacroTile(DRAW_CONTEXT* pDC, + HANDLE hWorkerPrivateData, + SWR_RENDERTARGET_ATTACHMENT rt, + uint32_t macroTile, + uint32_t renderTargetArrayIndex, + DWORD clear[4], + const SWR_RECT& rect) { // convert clear color to hottile format // clear color is in RGBA float/uint32 @@ -91,7 +97,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp))); vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); } - vComp = FormatTraits<format>::pack(comp, vComp); + vComp = FormatTraits<format>::pack(comp, vComp); vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; } @@ -106,7 +112,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp))); vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); } - vComp = FormatTraits<format>::pack(comp, vComp); + vComp = FormatTraits<format>::pack(comp, vComp); vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; } @@ -115,8 +121,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN MacroTileMgr::getTileIndices(macroTile, tileX, tileY); // Init to full macrotile - SWR_RECT clearTile = - { + SWR_RECT clearTile = { KNOB_MACROTILE_X_DIM * int32_t(tileX), KNOB_MACROTILE_Y_DIM * int32_t(tileY), KNOB_MACROTILE_X_DIM * int32_t(tileX + 1), @@ -127,7 +132,8 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN clearTile &= rect; // translate to local hottile origin - 
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); + clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, + -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); // Make maximums inclusive (needed for convert to raster tiles) clearTile.xmax -= 1; @@ -141,14 +147,29 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); // compute steps between raster tile samples / raster tiles / macro tile rows - const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; - const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; + const uint32_t rasterTileSampleStep = + KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; + const uint32_t rasterTileStep = + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; - const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); - - HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, hWorkerPrivateData, macroTile, rt, true, numSamples, renderTargetArrayIndex); - uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples; - uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; + const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); + + HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, + pDC, + hWorkerPrivateData, + macroTile, + rt, + true, + numSamples, + renderTargetArrayIndex); + uint32_t rasterTileStartOffset = + (ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>( + pitch, clearTile.xmin, clearTile.ymin)) * + numSamples; + uint8_t* pRasterTileRow = + pHotTile->pBuffer + + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, + // FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; // loop over all raster tiles in the current hot tile for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y) @@ -156,7 +177,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN uint8_t* pRasterTile = pRasterTileRow; for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x) { - for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) + for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) { ClearRasterTile<format>(pRasterTile, vClear); pRasterTile += rasterTileSampleStep; @@ -168,17 +189,16 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_REN pHotTile->state = HOTTILE_DIRTY; } - -void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData) { - SWR_CONTEXT *pContext = pDC->pContext; - HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; + SWR_CONTEXT* pContext = pDC->pContext; + HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; if (KNOB_FAST_CLEAR) { - CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + CLEAR_DESC* pClear = 
(CLEAR_DESC*)pUserData; SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; - uint32_t numSamples = GetNumSamples(sampleCount); + uint32_t numSamples = GetNumSamples(sampleCount); SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. @@ -186,36 +206,58 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) { - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; + unsigned long rt = 0; + uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; while (_BitScanForward(&rt, mask)) { mask &= ~(1 << rt); - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex); + HOTTILE* pHotTile = + pContext->pHotTileMgr->GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroTile, + (SWR_RENDERTARGET_ATTACHMENT)rt, + true, + numSamples, + pClear->renderTargetArrayIndex); // All we want to do here is to mark the hot tile as being in a "needs clear" state. pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); - pHotTile->state = HOTTILE_CLEAR; + pHotTile->state = HOTTILE_CLEAR; } } if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex); + HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroTile, + SWR_ATTACHMENT_DEPTH, + true, + numSamples, + pClear->renderTargetArrayIndex); pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; - pHotTile->state = HOTTILE_CLEAR; + pHotTile->state = HOTTILE_CLEAR; } if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex); + HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroTile, + SWR_ATTACHMENT_STENCIL, + true, + numSamples, + pClear->renderTargetArrayIndex); pHotTile->clearData[0] = pClear->clearStencil; - pHotTile->state = HOTTILE_CLEAR; + pHotTile->state = HOTTILE_CLEAR; } RDTSC_END(BEClear, 1); @@ -223,7 +265,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo else { // Legacy clear - CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData; RDTSC_BEGIN(BEClear, pDC->drawId); if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) @@ -237,33 +279,51 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; SWR_ASSERT(pfnClearTiles != nullptr); - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; + unsigned long rt = 0; + uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; while (_BitScanForward(&rt, mask)) { mask &= ~(1 << rt); - pfnClearTiles(pDC, hWorkerPrivateData, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + pfnClearTiles(pDC, + 
hWorkerPrivateData, + (SWR_RENDERTARGET_ATTACHMENT)rt, + macroTile, + pClear->renderTargetArrayIndex, + clearData, + pClear->rect); } } if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) { DWORD clearData[4]; - clearData[0] = *(DWORD*)&pClear->clearDepth; + clearData[0] = *(DWORD*)&pClear->clearDepth; PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; SWR_ASSERT(pfnClearTiles != nullptr); - pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + pfnClearTiles(pDC, + hWorkerPrivateData, + SWR_ATTACHMENT_DEPTH, + macroTile, + pClear->renderTargetArrayIndex, + clearData, + pClear->rect); } if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) { DWORD clearData[4]; - clearData[0] = pClear->clearStencil; + clearData[0] = pClear->clearStencil; PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; - pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + pfnClearTiles(pDC, + hWorkerPrivateData, + SWR_ATTACHMENT_STENCIL, + macroTile, + pClear->renderTargetArrayIndex, + clearData, + pClear->rect); } RDTSC_END(BEClear, 1); @@ -274,9 +334,9 @@ void InitClearTilesTable() { memset(gClearTilesTable, 0, sizeof(gClearTilesTable)); - gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; - gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; - gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; - gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; - gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; + gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; + gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; + gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; + gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; + gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; } diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index 05234c21822..1798dad7bc5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -1,37 +1,39 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file backend.h -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.h + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. 
+ * + ******************************************************************************/ #pragma once -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); +void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); +void InitBackendSampleFuncTable( + PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); -static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); +static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, + SWR_PS_CONTEXT& psContext); enum SWR_BACKEND_FUNCS @@ -45,15 +47,18 @@ enum SWR_BACKEND_FUNCS #if KNOB_SIMD_WIDTH == 8 static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; -static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; +static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; #define MASK 0xff #endif -static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ) +static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, + float* pUserClipBuffer, + simdscalar const& vI, + simdscalar const& vJ) { - simdscalar vClipMask = _simd_setzero_ps(); - uint32_t numClipDistance = _mm_popcnt_u32(clipMask); + simdscalar vClipMask = _simd_setzero_ps(); + uint32_t numClipDistance = _mm_popcnt_u32(clipMask); for (uint32_t i = 0; i < numClipDistance; ++i) { @@ -76,23 +81,29 @@ static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuf INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) { - static const uint32_t RasterTileColorOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15, + static const uint32_t 
RasterTileColorOffsets[16]{ + 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * + 15, }; assert(sampleNum < 16); return RasterTileColorOffsets[sampleNum]; @@ -100,23 +111,29 @@ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) { - static const uint32_t RasterTileDepthOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15, + static const uint32_t RasterTileDepthOffsets[16]{ + 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM 
* FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * + 15, }; assert(sampleNum < 16); return RasterTileDepthOffsets[sampleNum]; @@ -124,60 +141,78 @@ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) { - static const uint32_t RasterTileStencilOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15, + static const uint32_t RasterTileStencilOffsets[16]{ + 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * + 15, }; assert(sampleNum < 16); return RasterTileStencilOffsets[sampleNum]; } -template<typename T, uint32_t InputCoverage> +template <typename T, uint32_t InputCoverage> struct generateInputCoverage { - INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) + INLINE generateInputCoverage(const uint64_t* const coverageMask, + uint32_t (&inputMask)[KNOB_SIMD_WIDTH], + const uint32_t sampleMask) { // will need to update for avx512 assert(KNOB_SIMD_WIDTH == 8); simdscalari mask[2]; simdscalari sampleCoverage[2]; - - if(T::bIsCenterPattern) + + if (T::bIsCenterPattern) { // center coverage is the same for all samples; just broadcast to the sample slots uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(T::MultisampleT::numSamples == 1) + if (T::MultisampleT::numSamples == 1) { sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); } - else if(T::MultisampleT::numSamples == 2) + else if (T::MultisampleT::numSamples == 2) { - sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + sampleCoverage[0] = + _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); } - else if(T::MultisampleT::numSamples == 4) + else if (T::MultisampleT::numSamples == 4) { - sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + sampleCoverage[0] = _simd_set_epi32( + 0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); } - else if(T::MultisampleT::numSamples == 8) + else if (T::MultisampleT::numSamples == 8) { sampleCoverage[0] = _simd_set1_epi32(centerCoverage); } - else if(T::MultisampleT::numSamples == 16) + else if (T::MultisampleT::numSamples == 16) { sampleCoverage[0] = _simd_set1_epi32(centerCoverage); sampleCoverage[1] = _simd_set1_epi32(centerCoverage); @@ -185,80 +220,127 @@ struct generateInputCoverage } else { - simdscalari src = _simd_set1_epi32(0); + simdscalari src = _simd_set1_epi32(0); simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - if(T::MultisampleT::numSamples == 1) + if (T::MultisampleT::numSamples == 1) { mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); } - else if(T::MultisampleT::numSamples == 2) + else if (T::MultisampleT::numSamples == 2) { mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); } - else if(T::MultisampleT::numSamples == 4) + else if (T::MultisampleT::numSamples == 4) { mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); } - else if(T::MultisampleT::numSamples == 8) + else if (T::MultisampleT::numSamples == 8) { mask[0] = _simd_set1_epi32(-1); } - else 
if(T::MultisampleT::numSamples == 16) + else if (T::MultisampleT::numSamples == 16) { mask[0] = _simd_set1_epi32(-1); mask[1] = _simd_set1_epi32(-1); - index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); } // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(T::MultisampleT::numSamples > 8) + sampleCoverage[0] = + _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), + (const float*)coverageMask, + index0, + _mm256_castsi256_ps(mask[0]), + 8)); + if (T::MultisampleT::numSamples > 8) { // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + sampleCoverage[1] = + _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), + (const float*)coverageMask, + index1, + _mm256_castsi256_ps(mask[1]), + 8)); } } - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + mask[0] = _mm256_set_epi8(-1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 0xC, + 0x8, + 0x4, + 0x0, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 0xC, + 0x8, + 0x4, + 0x0); // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); simdscalari packedCoverage1; - if(T::MultisampleT::numSamples > 8) + if (T::MultisampleT::numSamples > 8) { - // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit + // lane packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); } - #if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + simdscalar shufRes = _mm256_shuffle_ps( + _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256( + _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); simdscalari packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) + if (T::MultisampleT::numSamples > 8) { // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = 
_mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), + _mm256_castsi256_ps(hiToLow), + _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps( + _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps( + _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); } else { packedSampleCoverage = packedCoverage0; } - #else +#else simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); simdscalari packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) + if (T::MultisampleT::numSamples > 8) { permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane @@ -271,14 +353,15 @@ struct generateInputCoverage { packedSampleCoverage = packedCoverage0; } - #endif +#endif - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + // convert packed sample coverage masks into single coverage masks for all samples for + // each pixel in the 4x2 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - if(!T::bForcedSampleCount) + if (!T::bForcedSampleCount) { // input coverage has to be anded with sample mask if MSAA isn't forced on inputMask[i] &= sampleMask; @@ -289,35 +372,47 @@ struct generateInputCoverage } } - INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) + INLINE generateInputCoverage(const uint64_t* const coverageMask, + simdscalar& inputCoverage, + const uint32_t sampleMask) { uint32_t inputMask[KNOB_SIMD_WIDTH]; generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); + inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], + inputMask[6], + inputMask[5], + inputMask[4], + inputMask[3], + inputMask[2], + inputMask[1], + inputMask[0])); } - }; -template<typename T> +template <typename T> struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> { - INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) + INLINE generateInputCoverage(const uint64_t* const coverageMask, + simdscalar& inputCoverage, + const uint32_t sampleMask) { // will need to update for avx512 assert(KNOB_SIMD_WIDTH == 8); - simdscalari vec = _simd_set1_epi32(coverageMask[0]); + simdscalari vec = _simd_set1_epi32(coverageMask[0]); const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = _simd_and_si(vec, bit); - vec = _simd_cmplt_epi32(_simd_setzero_si(), vec); - vec = _simd_blendv_epi32(_simd_setzero_si(), 
_simd_set1_epi32(1), vec); - inputCoverage = _simd_castsi_ps(vec); + vec = _simd_and_si(vec, bit); + vec = _simd_cmplt_epi32(_simd_setzero_si(), vec); + vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); + inputCoverage = _simd_castsi_ps(vec); } - INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) + INLINE generateInputCoverage(const uint64_t* const coverageMask, + uint32_t (&inputMask)[KNOB_SIMD_WIDTH], + const uint32_t sampleMask) { - uint32_t simdCoverage = (coverageMask[0] & MASK); + uint32_t simdCoverage = (coverageMask[0] & MASK); static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; - for(int i = 0; i < KNOB_SIMD_WIDTH; i++) + for (int i = 0; i < KNOB_SIMD_WIDTH; i++) { // set all samples to covered if conservative coverage mask is set for that pixel inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; @@ -327,18 +422,25 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Centroid behaves exactly as follows : -// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to +// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center +// (even if the sample pattern does not happen to // have a sample location there). -// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the +// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample +// index, where sample coverage is after ANDing the // coverage with the SampleMask Rasterizer State. -// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is -// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the -// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. +// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to +// fill out 2x2 pixel stamps, the attribute is +// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the +// pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation +// point.Otherwise (full SampleMask), the pixel center is the evaluation point. 
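The three rules in the comment above reduce to a small per-pixel decision. A minimal scalar sketch (hypothetical helper, names not from the SWR sources; assumes numSamples <= 16 as elsewhere in this file), returning the sample index to evaluate at, or -1 for the pixel center:

#include <bit>      // std::countr_zero (C++20)
#include <cstdint>

int CentroidEvalSample(uint32_t coverage, uint32_t sampleMask, uint32_t numSamples)
{
    const uint32_t fullMask = (1u << numSamples) - 1;
    const uint32_t covered  = coverage & sampleMask & fullMask;

    if (covered == fullMask)
        return -1;                               // rule (1): fully covered -> pixel center
    if (covered != 0)
        return (int)std::countr_zero(covered);   // rule (2): first covered sample

    const uint32_t enabled = sampleMask & fullMask;
    if (enabled == fullMask)
        return -1;                               // rule (3b): full SampleMask -> pixel center
    return enabled ? (int)std::countr_zero(enabled) : 0; // rule (3a), defaulting to sample 0
}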
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template<typename T> -INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos, - const uint64_t *const coverageMask, const uint32_t sampleMask, - simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL) +template <typename T> +INLINE void CalcCentroidPos(SWR_PS_CONTEXT& psContext, + const SWR_MULTISAMPLE_POS& samplePos, + const uint64_t* const coverageMask, + const uint32_t sampleMask, + simdscalar const& vXSamplePosUL, + simdscalar const& vYSamplePosUL) { uint32_t inputMask[KNOB_SIMD_WIDTH]; generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); @@ -356,50 +458,60 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); - // look up and set the sample offsets from UL pixel corner for first covered sample + // look up and set the sample offsets from UL pixel corner for first covered sample simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]), - samplePos.X(sampleNum[6]), - samplePos.X(sampleNum[5]), - samplePos.X(sampleNum[4]), - samplePos.X(sampleNum[3]), - samplePos.X(sampleNum[2]), - samplePos.X(sampleNum[1]), - samplePos.X(sampleNum[0])); + samplePos.X(sampleNum[6]), + samplePos.X(sampleNum[5]), + samplePos.X(sampleNum[4]), + samplePos.X(sampleNum[3]), + samplePos.X(sampleNum[2]), + samplePos.X(sampleNum[1]), + samplePos.X(sampleNum[0])); simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]), - samplePos.Y(sampleNum[6]), - samplePos.Y(sampleNum[5]), - samplePos.Y(sampleNum[4]), - samplePos.Y(sampleNum[3]), - samplePos.Y(sampleNum[2]), - samplePos.Y(sampleNum[1]), - samplePos.Y(sampleNum[0])); + samplePos.Y(sampleNum[6]), + samplePos.Y(sampleNum[5]), + samplePos.Y(sampleNum[4]), + samplePos.Y(sampleNum[3]), + samplePos.Y(sampleNum[2]), + samplePos.Y(sampleNum[1]), + samplePos.Y(sampleNum[0])); // add sample offset to UL pixel corner vXSample = _simd_add_ps(vXSamplePosUL, vXSample); vYSample = _simd_add_ps(vYSamplePosUL, vYSample); // Case (1) and case (3b) - All samples covered or not covered with full SampleMask static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask(); - simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); + simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], + inputMask[6], + inputMask[5], + inputMask[4], + inputMask[3], + inputMask[2], + inputMask[1], + inputMask[0]); simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); static const simdscalari vZero = _simd_setzero_si(); - const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); - simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); - simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); - simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); + const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); + simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); + simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); + 
simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); // set the centroid position based on results from above - psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); - psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); + psContext.vX.centroid = + _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); + psContext.vY.centroid = + _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); // Case (3a) No samples covered and partial sample mask simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); // sample mask should never be all 0's for this case, but handle it anyways unsigned long firstCoveredSampleMaskSample = 0; - (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); + (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) + : (firstCoveredSampleMaskSample = 0); simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); @@ -407,24 +519,34 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); // blend in case 3a pixel locations - psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); - psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); + psContext.vX.centroid = + _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); + psContext.vY.centroid = + _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); } -INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, - const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL) +INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, + SWR_PS_CONTEXT& psContext, + const simdscalar& vXSamplePosUL, + const simdscalar& vYSamplePosUL) { // evaluate I,J - psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vI.centroid = + vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vJ.centroid = + vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); // interpolate 1/w - psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); + psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, + coeffs.vBOneOverW, + coeffs.vCOneOverW, + psContext.vI.centroid, + psContext.vJ.centroid); } -INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz) +INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz) { const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); @@ -432,16 +554,17 @@ INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float 
minz, float return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); } -template<typename T> +template <typename T> INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) { // RT has to be single sample if we're in forcedMSAA mode - if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) + if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) { return 1; } - // unless we're forced to single sample, in which case we run the OM at the sample count of the RT - else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) + // unless we're forced to single sample, in which case we run the OM at the sample count of the + // RT + else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) { return GetNumSamples(blendSampleCount); } @@ -452,7 +575,7 @@ INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) } } -inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work) +inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work) { // broadcast scalars @@ -475,9 +598,12 @@ inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); } -inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers) +inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS], + uint8_t** pDepthBuffer, + uint8_t** pStencilBuffer, + uint32_t colorHotTileMask, + RenderOutputBuffers& renderBuffers) { - DWORD index; while (_BitScanForward(&index, colorHotTileMask)) { @@ -493,41 +619,51 @@ inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uin if (pStencilBuffer) { - *pStencilBuffer = renderBuffers.pStencil;; + *pStencilBuffer = renderBuffers.pStencil; + ; } } -template<typename T> -void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work) +template <typename T> +void SetupPixelShaderContext(SWR_PS_CONTEXT* psContext, + const SWR_MULTISAMPLE_POS& samplePos, + SWR_TRIANGLE_DESC& work) { - psContext->pAttribs = work.pAttribs; - psContext->pPerspAttribs = work.pPerspAttribs; - psContext->frontFace = work.triFlags.frontFacing; + psContext->pAttribs = work.pAttribs; + psContext->pPerspAttribs = work.pPerspAttribs; + psContext->frontFace = work.triFlags.frontFacing; psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull + // attribs psContext->I = work.I; psContext->J = work.J; psContext->recipDet = work.recipDet; - psContext->pRecipW = work.pRecipW; - psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX); - psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY); + psContext->pRecipW = work.pRecipW; + psContext->pSamplePosX = + samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX); + psContext->pSamplePosY = + samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY); psContext->rasterizerSampleCount = T::MultisampleT::numSamples; - psContext->sampleIndex = 0; + 
psContext->sampleIndex = 0; } -template<typename T, bool IsSingleSample> -void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, - const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) +template <typename T, bool IsSingleSample> +void CalcCentroid(SWR_PS_CONTEXT* psContext, + const SWR_MULTISAMPLE_POS& samplePos, + const BarycentricCoeffs& coeffs, + const uint64_t* const coverageMask, + uint32_t sampleMask) { - if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different + if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid + // positions are still different { // for 1x case, centroid is pixel center - psContext->vX.centroid = psContext->vX.center; - psContext->vY.centroid = psContext->vY.center; - psContext->vI.centroid = psContext->vI.center; - psContext->vJ.centroid = psContext->vJ.center; + psContext->vX.centroid = psContext->vX.center; + psContext->vY.centroid = psContext->vY.center; + psContext->vI.centroid = psContext->vI.center; + psContext->vJ.centroid = psContext->vJ.center; psContext->vOneOverW.centroid = psContext->vOneOverW.center; } else @@ -542,8 +678,14 @@ void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePo } else { - // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. - CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); + // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate + // coverage 2X'.. + CalcCentroidPos<T>(*psContext, + samplePos, + coverageMask, + sampleMask, + psContext->vX.UL, + psContext->vY.UL); } CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); @@ -556,47 +698,61 @@ void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePo } } -template<typename T> +template <typename T> struct PixelRateZTestLoop { - PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, - uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) : - pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), - samplePos(state.rastState.samplePositions), - clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; + PixelRateZTestLoop(DRAW_CONTEXT* DC, + uint32_t _workerId, + const SWR_TRIANGLE_DESC& Work, + const BarycentricCoeffs& Coeffs, + const API_STATE& apiState, + uint8_t*& depthBuffer, + uint8_t*& stencilBuffer, + const uint8_t ClipDistanceMask) : + pDC(DC), + workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), + samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask), + pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; INLINE - uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, - const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) + uint32_t operator()(simdscalar& activeLanes, + SWR_PS_CONTEXT& psContext, + const CORE_BUCKETS BEDepthBucket, + uint32_t currentSimdIn8x8 = 0) { - uint32_t statCount = 0; + uint32_t statCount = 0; simdscalar anyDepthSamplePassed = _simd_setzero_ps(); - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + for (uint32_t sample = 0; sample < 
T::MultisampleT::numCoverageSamples; sample++) { - const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK)); + const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample]; + vCoverageMask[sample] = + _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK)); - if(!_simd_movemask_ps(vCoverageMask[sample])) + if (!_simd_movemask_ps(vCoverageMask[sample])) { - vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); + vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = + _simd_setzero_ps(); continue; } // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, + "Unsupported depth hot tile format"); - const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); + const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz))); + vCoverageMask[sample] = + _simd_and_ps(vCoverageMask[sample], + _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz))); } RDTSC_BEGIN(BEBarycentric, pDC->drawId); @@ -608,7 +764,7 @@ struct PixelRateZTestLoop // calc I & J per sample CalcSampleBarycentrics(coeffs, psContext); - if(psState.writesODepth) + if (psState.writesODepth) { { // broadcast and test oDepth(psContext.vZ) written from the PS for each sample @@ -617,7 +773,8 @@ struct PixelRateZTestLoop } else { - vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + vZ[sample] = vplaneps( + coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); } @@ -625,36 +782,52 @@ struct PixelRateZTestLoop ///@todo: perspective correct vs non-perspective correct clipping? // if clip distances are enabled, we need to interpolate for each sample - if(clipDistanceMask) + if (clipDistanceMask) { - uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); + uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, + work.pUserClipBuffer, + psContext.vI.sample, + psContext.vJ.sample); - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask)); + vCoverageMask[sample] = + _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask)); } // ZTest for this sample ///@todo Need to uncomment out this bucket. 
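Each iteration of the sample loop above widens one byte of the 64-bit coverage word into a per-lane float mask (_simd_vmask_ps) and skips the sample entirely when _simd_movemask_ps reports no active lanes. The idiom written with raw AVX2 intrinsics (a sketch, assuming KNOB_SIMD_WIDTH == 8 as the asserts in this file do):

#include <immintrin.h>
#include <cstdint>

// Expand an 8-bit coverage mask (bit i set -> lane i covered) into an
// 8-lane float mask whose sign bit marks covered lanes.
static inline __m256 vmask_ps(uint32_t mask8)
{
    const __m256i bits = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    __m256i v = _mm256_set1_epi32((int)mask8);
    v = _mm256_and_si256(v, bits);                     // isolate each lane's bit
    v = _mm256_cmpgt_epi32(v, _mm256_setzero_si256()); // nonzero -> all-ones lane
    return _mm256_castsi256_ps(v);
}

// Early-out used per sample: collapse the lane sign bits back to a scalar.
static inline bool anyLaneActive(__m256 mask)
{
    return _mm256_movemask_ps(mask) != 0;
}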
- //RDTSC_BEGIN(BEDepthBucket, pDC->drawId); - depthPassMask[sample] = vCoverageMask[sample]; + // RDTSC_BEGIN(BEDepthBucket, pDC->drawId); + depthPassMask[sample] = vCoverageMask[sample]; stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - vZ[sample], pDepthSample, vCoverageMask[sample], - pStencilSample, &stencilPassMask[sample]); - //RDTSC_END(BEDepthBucket, 0); + depthPassMask[sample] = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + vZ[sample], + pDepthSample, + vCoverageMask[sample], + pStencilSample, + &stencilPassMask[sample]); + // RDTSC_END(BEDepthBucket, 0); // early-exit if no pixels passed depth or earlyZ is forced on - if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) + if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], - pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); - - if(!_simd_movemask_ps(depthPassMask[sample])) + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + vZ[sample], + pDepthSample, + depthPassMask[sample], + vCoverageMask[sample], + pStencilSample, + stencilPassMask[sample]); + + if (!_simd_movemask_ps(depthPassMask[sample])) { continue; } } anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); - uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); + uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); statCount += _mm_popcnt_u32(statMask); } @@ -672,106 +845,129 @@ struct PixelRateZTestLoop private: // functor inputs DRAW_CONTEXT* pDC; - uint32_t workerId; + uint32_t workerId; - const SWR_TRIANGLE_DESC& work; - const BarycentricCoeffs& coeffs; - const API_STATE& state; - const SWR_PS_STATE& psState; + const SWR_TRIANGLE_DESC& work; + const BarycentricCoeffs& coeffs; + const API_STATE& state; + const SWR_PS_STATE& psState; const SWR_MULTISAMPLE_POS& samplePos; - const uint8_t clipDistanceMask; - uint8_t*& pDepthBuffer; - uint8_t*& pStencilBuffer; + const uint8_t clipDistanceMask; + uint8_t*& pDepthBuffer; + uint8_t*& pStencilBuffer; }; -INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext) { // evaluate I,J - psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); - psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); + psContext.vI.center = + vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); + psContext.vJ.center = + vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); // interpolate 1/w - psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); + psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, + coeffs.vBOneOverW, + coeffs.vCOneOverW, + psContext.vI.center, + psContext.vJ.center); } -static INLINE void CalcSampleBarycentrics(const 
BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, + SWR_PS_CONTEXT& psContext) { // evaluate I,J - psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); - psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); + psContext.vI.sample = + vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); + psContext.vJ.sample = + vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); // interpolate 1/w - psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); + psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, + coeffs.vBOneOverW, + coeffs.vCOneOverW, + psContext.vI.sample, + psContext.vJ.sample); } // Merge Output to 4x2 SIMD Tile Format -INLINE void OutputMerger4x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, uint32_t workerId) +INLINE void OutputMerger4x2(DRAW_CONTEXT* pDC, + SWR_PS_CONTEXT& psContext, + uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], + uint32_t sample, + const SWR_BLEND_STATE* pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], + simdscalar& coverageMask, + simdscalar const& depthPassMask, + uint32_t renderTargetMask, + uint32_t workerId) { // type safety guaranteed from template instantiation in BEChooser<>::GetFunc const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - simdvector blendOut; + simdvector blendOut; DWORD rt = 0; while (_BitScanForward(&rt, renderTargetMask)) { renderTargetMask &= ~(1 << rt); - uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; + uint8_t* pColorSample = pColorBase[rt] + rasterTileColorOffset; - const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt]; - SWR_BLEND_CONTEXT blendContext = { 0 }; + SWR_BLEND_CONTEXT blendContext = {0}; { // pfnBlendFunc may not update all channels. Initialize with PS output. /// TODO: move this into the blend JIT. 
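CalcPixelBarycentrics and CalcSampleBarycentrics above evaluate the same screen-space plane equations, only at different positions (pixel center vs. per-sample). Assuming vplaneps(a, b, c, x, y) computes a*x + b*y + c (a sketch of the convention, not the SWR definition), the per-lane math is:

struct Bary { float i, j, oneOverW; };

// Evaluate the I and J planes at (x, y), normalize by the reciprocal
// determinant, then interpolate 1/w as a plane in the resulting (i, j) space.
Bary EvalBarycentrics(const float I[3], const float J[3], const float W[3],
                      float recipDet, float x, float y)
{
    Bary b;
    b.i = (I[0] * x + I[1] * y + I[2]) * recipDet;
    b.j = (J[0] * x + J[1] * y + J[2]) * recipDet;
    b.oneOverW = W[0] * b.i + W[1] * b.j + W[2]; // k is implicit: k = 1 - i - j
    return b;
}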
blendOut = psContext.shaded[rt]; blendContext.pBlendState = pBlendState; - blendContext.src = &psContext.shaded[rt]; - blendContext.src1 = &psContext.shaded[1]; - blendContext.src0alpha = reinterpret_cast<simdvector *>(&psContext.shaded[0].w); - blendContext.sampleNum = sample; - blendContext.pDst = (simdvector *) &pColorSample; - blendContext.result = &blendOut; - blendContext.oMask = &psContext.oMask; - blendContext.pMask = reinterpret_cast<simdscalari *>(&coverageMask); + blendContext.src = &psContext.shaded[rt]; + blendContext.src1 = &psContext.shaded[1]; + blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w); + blendContext.sampleNum = sample; + blendContext.pDst = (simdvector*)&pColorSample; + blendContext.result = &blendOut; + blendContext.oMask = &psContext.oMask; + blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask); // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) + if (pfnBlendFunc[rt] != nullptr) { pfnBlendFunc[rt](&blendContext); } } // Track alpha events - AR_EVENT(AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); + AR_EVENT( + AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); - // final write mask + // final write mask simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, + "Unsupported hot tile format"); const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); // store with color mask - if(!pRTBlend->writeDisableRed) + if (!pRTBlend->writeDisableRed) { _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); } - if(!pRTBlend->writeDisableGreen) + if (!pRTBlend->writeDisableGreen) { _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); } - if(!pRTBlend->writeDisableBlue) + if (!pRTBlend->writeDisableBlue) { _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); } - if(!pRTBlend->writeDisableAlpha) + if (!pRTBlend->writeDisableAlpha) { _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); } @@ -780,8 +976,17 @@ INLINE void OutputMerger4x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_ #if USE_8x2_TILE_BACKEND // Merge Output to 8x2 SIMD16 Tile Format -INLINE void OutputMerger8x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset, uint32_t workerId) +INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC, + SWR_PS_CONTEXT& psContext, + uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], + uint32_t sample, + const SWR_BLEND_STATE* pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], + simdscalar& coverageMask, + simdscalar const& depthPassMask, + uint32_t renderTargetMask, + bool useAlternateOffset, + uint32_t workerId) { // type safety guaranteed from template instantiation in BEChooser<>::GetFunc uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); @@ -799,150 +1004,180 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_ { renderTargetMask 
&= ~(1 << rt); - const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt]; simdscalar* pColorSample; - bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; + bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || + !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; if (hotTileEnable) { - pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset); - blendSrc[0] = pColorSample[0]; - blendSrc[1] = pColorSample[2]; - blendSrc[2] = pColorSample[4]; - blendSrc[3] = pColorSample[6]; + pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset); + blendSrc[0] = pColorSample[0]; + blendSrc[1] = pColorSample[2]; + blendSrc[2] = pColorSample[4]; + blendSrc[3] = pColorSample[6]; } else { pColorSample = nullptr; } - SWR_BLEND_CONTEXT blendContext = { 0 }; + SWR_BLEND_CONTEXT blendContext = {0}; { // pfnBlendFunc may not update all channels. Initialize with PS output. /// TODO: move this into the blend JIT. blendOut = psContext.shaded[rt]; - blendContext.pBlendState = pBlendState; - blendContext.src = &psContext.shaded[rt]; - blendContext.src1 = &psContext.shaded[1]; - blendContext.src0alpha = reinterpret_cast<simdvector *>(&psContext.shaded[0].w); - blendContext.sampleNum = sample; - blendContext.pDst = &blendSrc; - blendContext.result = &blendOut; - blendContext.oMask = &psContext.oMask; - blendContext.pMask = reinterpret_cast<simdscalari *>(&coverageMask); + blendContext.pBlendState = pBlendState; + blendContext.src = &psContext.shaded[rt]; + blendContext.src1 = &psContext.shaded[1]; + blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w); + blendContext.sampleNum = sample; + blendContext.pDst = &blendSrc; + blendContext.result = &blendOut; + blendContext.oMask = &psContext.oMask; + blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask); // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) + if (pfnBlendFunc[rt] != nullptr) { pfnBlendFunc[rt](&blendContext); } } // Track alpha events - AR_EVENT(AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); + AR_EVENT( + AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); - // final write mask + // final write mask simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. 
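The final store above folds three masks together: dynamic per-lane coverage, per-lane depth-pass results, and the render target's static writeDisable flags, then commits each enabled channel with a masked store. One channel modeled with raw AVX (a sketch; _mm256_maskstore_ps writes only lanes whose mask sign bit is set, leaving the other destination floats untouched):

#include <immintrin.h>

void StoreColorChannel(float* dst, __m256 coverageMask, __m256 depthPassMask,
                       bool writeDisabled, __m256 blendOut)
{
    if (writeDisabled)
        return; // static per-RT channel mask (e.g. writeDisableRed)

    // lanes that are covered AND passed depth
    __m256i outputMask = _mm256_castps_si256(_mm256_and_ps(coverageMask, depthPassMask));
    _mm256_maskstore_ps(dst, outputMask, blendOut); // untouched lanes keep old color
}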
- static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, + "Unsupported hot tile format"); // store with color mask if (!pRTBlend->writeDisableRed) { - _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x); + _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x); } if (!pRTBlend->writeDisableGreen) { - _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y); + _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y); } if (!pRTBlend->writeDisableBlue) { - _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z); + _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z); } if (!pRTBlend->writeDisableAlpha) { - _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w); + _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w); } } } #endif -template<typename T> -void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +template <typename T> +void BackendPixelRate(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t x, + uint32_t y, + SWR_TRIANGLE_DESC& work, + RenderOutputBuffers& renderBuffers) { - ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend + ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the + /// backend RDTSC_BEGIN(BEPixelRateBackend, pDC->drawId); RDTSC_BEGIN(BESetup, pDC->drawId); - const API_STATE &state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); - SWR_CONTEXT *pContext = pDC->pContext; - void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; + SWR_CONTEXT* pContext = pDC->pContext; + void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - SWR_PS_CONTEXT psContext; + SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, + &pDepthBuffer, + &pStencilBuffer, + state.colorHottileEnable, + renderBuffers); RDTSC_END(BESetup, 0); - PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask); + PixelRateZTestLoop<T> PixelRateZTest(pDC, + workerId, + work, + coeffs, + state, + pDepthBuffer, + pStencilBuffer, + state.backendState.clipDistanceMask); - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); - for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); + psContext.vX.UL = 
_simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); - for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { #if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); #endif simdscalar activeLanes; - if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; + if (!(work.anyCoveredSamples & MASK)) + { + goto Endtile; + }; activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK); if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + const uint64_t* pCoverageMask = + (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + ? &work.innerCoverageMask + : &work.coverageMask[0]; - generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + generateInputCoverage<T, T::InputCoverage>( + pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } RDTSC_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid<T, false>( + &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); RDTSC_END(BEBarycentric, 0); - if(T::bForcedSampleCount) + if (T::bForcedSampleCount) { - // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(activeLanes, vSampleMask); + // candidate pixels (that passed coverage) will cause shader invocation if any bits + // in the samplemask are set + const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32( + _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); + activeLanes = _simd_and_ps(activeLanes, vSampleMask); } // Early-Z? 
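BackendPixelRate above walks the macrotile in SIMD-sized steps, consuming coverage bits as it goes and shifting them out at Endtile. A stripped-down model of the traversal (illustrative tile dimensions; the real values come from the KNOB_* configuration):

#include <cstdint>

constexpr uint32_t TILE_X = 64, TILE_Y = 64; // stand-ins for KNOB_TILE_*_DIM
constexpr uint32_t STEP_X = 4,  STEP_Y = 2;  // stand-ins for SIMD_TILE_*_DIM

void WalkTile(uint32_t x, uint32_t y, uint64_t coverageMask)
{
    for (uint32_t yy = y; yy < y + TILE_Y; yy += STEP_Y)
    {
        for (uint32_t xx = x; xx < x + TILE_X; xx += STEP_X)
        {
            if (coverageMask & 0xFFull) // any covered pixel in this step?
            {
                // ... interpolate, shade and blend the STEP_X x STEP_Y block at (xx, yy) ...
            }
            // retire this step's coverage bits, as done at the Endtile label above
            coverageMask >>= (STEP_X * STEP_Y);
        }
    }
}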
- if(T::bCanEarlyZ && !T::bForcedSampleCount) + if (T::bCanEarlyZ && !T::bForcedSampleCount) { uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); UPDATE_STAT_BE(DepthPassCount, depthPassCount); @@ -950,20 +1185,24 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t } // if we have no covered samples that passed depth at this point, go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + if (!_simd_movemask_ps(activeLanes)) + { + goto Endtile; + }; - if(state.psState.usesSourceDepth) + if (state.psState.usesSourceDepth) { RDTSC_BEGIN(BEBarycentric, pDC->drawId); // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = vplaneps( + coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_END(BEBarycentric, 0); } // pixels that are currently active psContext.activeMask = _simd_castps_si(activeLanes); - psContext.oMask = T::MultisampleT::FullSampleMask(); + psContext.oMask = T::MultisampleT::FullSampleMask(); // execute pixel shader RDTSC_BEGIN(BEPixelShader, pDC->drawId); @@ -976,29 +1215,39 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t AR_EVENT(PSStats(psContext.stats.numInstExecuted)); // update active lanes to remove any discarded or oMask'd pixels - activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + activeLanes = _simd_castsi_ps(_simd_and_si( + psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); + if (!_simd_movemask_ps(activeLanes)) + { + goto Endtile; + }; // late-Z - if(!T::bCanEarlyZ && !T::bForcedSampleCount) + if (!T::bCanEarlyZ && !T::bForcedSampleCount) { uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); UPDATE_STAT_BE(DepthPassCount, depthPassCount); AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); } - // if we have no covered samples that passed depth at this point, skip OM and go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + // if we have no covered samples that passed depth at this point, skip OM and go to next + // tile + if (!_simd_movemask_ps(activeLanes)) + { + goto Endtile; + }; // output merger // loop over all samples, broadcasting the results of the PS to all passing pixels - for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++) + for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); + sample++) { RDTSC_BEGIN(BEOutputMerger, pDC->drawId); - // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples - uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; + // center pattern does a single coverage/depth/stencil test, standard pattern tests + // all samples + uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 
0 : sample; simdscalar coverageMask, depthMask; - if(T::bForcedSampleCount) + if (T::bForcedSampleCount) { coverageMask = depthMask = activeLanes; } @@ -1006,40 +1255,66 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t { coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; - if(!_simd_movemask_ps(depthMask)) + if (!_simd_movemask_ps(depthMask)) { // stencil should already have been written in early/lateZ tests RDTSC_END(BEOutputMerger, 0); continue; } } - + // broadcast the results of the PS to all passing pixels #if USE_8x2_TILE_BACKEND - OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset, workerId); -#else // USE_8x2_TILE_BACKEND - OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, workerId); + OutputMerger8x2(pDC, + psContext, + psContext.pColorBuffer, + sample, + &state.blendState, + state.pfnBlendFunc, + coverageMask, + depthMask, + state.psState.renderTargetMask, + useAlternateOffset, + workerId); +#else // USE_8x2_TILE_BACKEND + OutputMerger4x2(pDC, + psContext, + psContext.pColorBuffer, + sample, + &state.blendState, + state.pfnBlendFunc, + coverageMask, + depthMask, + state.psState.renderTargetMask, + workerId); #endif // USE_8x2_TILE_BACKEND - if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) + if (!state.psState.forceEarlyZ && !T::bForcedSampleCount) { - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], - pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); + uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + PixelRateZTest.vZ[coverageSampleNum], + pDepthSample, + depthMask, + coverageMask, + pStencilSample, + PixelRateZTest.stencilPassMask[coverageSampleNum]); } RDTSC_END(BEOutputMerger, 0); } -Endtile: + Endtile: RDTSC_BEGIN(BEEndTile, pDC->drawId); - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) { work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) { work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } @@ -1048,48 +1323,55 @@ Endtile: #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - DWORD rt; + DWORD rt; uint32_t rtMask = state.colorHottileEnable; while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } } #else - DWORD rt; + DWORD rt; uint32_t rtMask = state.colorHottileEnable; while (_BitScanForward(&rt, rtMask)) { rtMask 
&= ~(1 << rt); - psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } #endif pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBuffer += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; RDTSC_END(BEEndTile, 0); - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); } - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); } RDTSC_END(BEPixelRateBackend, 0); } -template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0, - uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0 - > +template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X, + uint32_t isCenter = 0, + uint32_t coverage = 0, + uint32_t centroid = 0, + uint32_t forced = 0, + uint32_t canEarlyZ = 0 + > struct SwrBackendTraits { - static const bool bIsCenterPattern = (isCenter == 1); - static const uint32_t InputCoverage = coverage; - static const bool bCentroidPos = (centroid == 1); - static const bool bForcedSampleCount = (forced == 1); - static const bool bCanEarlyZ = (canEarlyZ == 1); + static const bool bIsCenterPattern = (isCenter == 1); + static const uint32_t InputCoverage = coverage; + static const bool bCentroidPos = (centroid == 1); + static const bool bForcedSampleCount = (forced == 1); + static const bool bCanEarlyZ = (canEarlyZ == 1); typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp index 5940aa7ba45..a1a1185bcfb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
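SwrBackendTraits above bakes the runtime backend configuration into compile-time constants, so branches such as if (T::bCanEarlyZ) in the backends fold away at template instantiation. A minimal usage sketch (the chosen parameter values are illustrative):

// sampleCountT, isCenter, coverage, centroid, forced, canEarlyZ
using Traits4xEarlyZ =
    SwrBackendTraits<SWR_MULTISAMPLE_4X, 0, SWR_INPUT_COVERAGE_NONE, 0, 0, 1>;

static_assert(Traits4xEarlyZ::bCanEarlyZ, "early-Z capable variant");
static_assert(!Traits4xEarlyZ::bForcedSampleCount, "normal sample count");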
-* -* @file backend.cpp -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.cpp + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. + * + ******************************************************************************/ #include <smmintrin.h> @@ -37,35 +37,44 @@ #include <algorithm> -template<typename T> -void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +template <typename T> +void BackendSampleRate(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t x, + uint32_t y, + SWR_TRIANGLE_DESC& work, + RenderOutputBuffers& renderBuffers) { RDTSC_BEGIN(BESampleRateBackend, pDC->drawId); RDTSC_BEGIN(BESetup, pDC->drawId); - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - const API_STATE &state = GetApiState(pDC); + void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; + const API_STATE& state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); - SWR_PS_CONTEXT psContext; + SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, + &pDepthBuffer, + &pStencilBuffer, + state.colorHottileEnable, + renderBuffers); RDTSC_END(BESetup, 0); - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = 
_simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); @@ -77,16 +86,21 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ #endif if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + const uint64_t* pCoverageMask = + (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + ? &work.innerCoverageMask + : &work.coverageMask[0]; - generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + generateInputCoverage<T, T::InputCoverage>( + pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } RDTSC_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid<T, false>( + &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); RDTSC_END(BEBarycentric, 0); @@ -97,14 +111,16 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (coverageMask) { // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, + "Unsupported depth hot tile format"); - const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); + const simdscalar z = + _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; @@ -121,7 +137,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = vplaneps(coeffs.vZa, + coeffs.vZb, + coeffs.vZc, + psContext.vI.sample, + psContext.vJ.sample); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_END(BEBarycentric, 0); @@ -129,27 +149,45 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { - coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); + coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, + work.pUserClipBuffer, + psContext.vI.sample, + psContext.vJ.sample); } - simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); - simdscalar depthPassMask = vCoverageMask; + simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); + simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; // Early-Z? 
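The depth-bounds path above loads the raw R32_FLOAT hot-tile samples and rejects lanes outside [minz, maxz] before any barycentric work. CalcDepthBoundsAcceptMask written out with raw AVX intrinsics (a sketch of the same comparisons):

#include <immintrin.h>

// Lanes pass when minz <= z <= maxz; the two ordered compares produce
// all-ones lanes and movemask collapses the sign bits into a scalar mask.
int CalcDepthBoundsAccept(__m256 z, float minz, float maxz)
{
    __m256 geMin = _mm256_cmp_ps(z, _mm256_set1_ps(minz), _CMP_GE_OQ);
    __m256 leMax = _mm256_cmp_ps(z, _mm256_set1_ps(maxz), _CMP_LE_OQ);
    return _mm256_movemask_ps(_mm256_and_ps(geMin, leMax));
}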
if (T::bCanEarlyZ) { RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + depthPassMask = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + psContext.vZ, + pDepthSample, + vCoverageMask, + pStencilSample, + &stencilPassMask); + AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), + _simd_movemask_ps(stencilPassMask), + _simd_movemask_ps(vCoverageMask))); RDTSC_END(BEEarlyDepthTest, 0); // early-exit if no samples passed depth or earlyZ is forced on. if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthSample, + depthPassMask, + vCoverageMask, + pStencilSample, + stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) { @@ -160,7 +198,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ } psContext.sampleIndex = sample; - psContext.activeMask = _simd_castps_si(vCoverageMask); + psContext.activeMask = _simd_castps_si(vCoverageMask); // execute pixel shader RDTSC_BEGIN(BEPixelShader, pDC->drawId); @@ -177,39 +215,80 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (!T::bCanEarlyZ) { RDTSC_BEGIN(BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + depthPassMask = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + psContext.vZ, + pDepthSample, + vCoverageMask, + pStencilSample, + &stencilPassMask); + AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), + _simd_movemask_ps(stencilPassMask), + _simd_movemask_ps(vCoverageMask))); RDTSC_END(BELateDepthTest, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthSample, + depthPassMask, + vCoverageMask, + pStencilSample, + stencilPassMask); work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); continue; } } - uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); // output merger RDTSC_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND - OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, 
vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId); + OutputMerger8x2(pDC, + psContext, + psContext.pColorBuffer, + sample, + &state.blendState, + state.pfnBlendFunc, + vCoverageMask, + depthPassMask, + state.psState.renderTargetMask, + useAlternateOffset, + workerId); #else - OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId); + OutputMerger4x2(pDC, + psContext, + psContext.pColorBuffer, + sample, + &state.blendState, + state.pfnBlendFunc, + vCoverageMask, + depthPassMask, + state.psState.renderTargetMask, + workerId); #endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthSample, + depthPassMask, + vCoverageMask, + pStencilSample, + stencilPassMask); } RDTSC_END(BEOutputMerger, 0); } @@ -229,12 +308,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - DWORD rt; + DWORD rt; uint32_t rtMask = state.colorHottileEnable; while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } } #else @@ -243,19 +323,21 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } #endif pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBuffer += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; RDTSC_END(BEEndTile, 0); - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); } - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); } @@ -272,7 +354,9 @@ struct BEChooserSampleRate { switch (tArg) { - case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break; + case SWR_BACKEND_MSAA_SAMPLE_RATE: + return BackendSampleRate<SwrBackendTraits<ArgsT...>>; + break; case SWR_BACKEND_SINGLE_SAMPLE: case SWR_BACKEND_MSAA_PIXEL_RATE: SWR_ASSERT(0 && "Invalid backend func\n"); @@ -291,12 +375,22 @@ struct BEChooserSampleRate { switch (tArg) { - case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return 
BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_NONE: + return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( + remainingArgs...); + break; + case SWR_INPUT_COVERAGE_NORMAL: + return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc( + remainingArgs...); + break; + case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: + return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc( + remainingArgs...); + break; default: SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); + return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( + remainingArgs...); break; } } @@ -307,11 +401,21 @@ struct BEChooserSampleRate { switch (tArg) { - case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_1X: + return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_2X: + return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_4X: + return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_8X: + return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_16X: + return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); + break; default: SWR_ASSERT(0 && "Invalid sample count\n"); return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); @@ -332,9 +436,11 @@ struct BEChooserSampleRate } }; -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) +void InitBackendSampleFuncTable( + PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) { - for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) + for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; + sampleCount++) { for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) { @@ -343,8 +449,14 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_CO for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, - (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + BEChooserSampleRate<>::GetFunc( + (SWR_MULTISAMPLE_COUNT)sampleCount, + false, + (SWR_INPUT_COVERAGE)inputCoverage, + (centroid > 0), + false, + (canEarlyZ > 0), + (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp index aaaba636ed3..2efb01f95db 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file backend.cpp -* -* @brief Backend handles rasterization, pixel shading and output merger -* operations. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file backend.cpp + * + * @brief Backend handles rasterization, pixel shading and output merger + * operations. 
+ * + ******************************************************************************/ #include <smmintrin.h> @@ -37,36 +37,45 @@ #include <algorithm> -template<typename T> -void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +template <typename T> +void BackendSingleSample(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t x, + uint32_t y, + SWR_TRIANGLE_DESC& work, + RenderOutputBuffers& renderBuffers) { RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId); RDTSC_BEGIN(BESetup, pDC->drawId); void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - const API_STATE &state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); - SWR_PS_CONTEXT psContext; + SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, + &pDepthBuffer, + &pStencilBuffer, + state.colorHottileEnable, + renderBuffers); RDTSC_END(BESetup, 1); - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); @@ -82,9 +91,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 { if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, + "Unsupported depth hot tile format"); - const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer)); + const simdscalar z = + _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; @@ -94,19 +105,25 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + const uint64_t* pCoverageMask = + (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + ? 
&work.innerCoverageMask + : &work.coverageMask[0]; - generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + generateInputCoverage<T, T::InputCoverage>( + pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } RDTSC_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid<T, true>( + &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = vplaneps( + coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_END(BEBarycentric, 1); @@ -114,27 +131,45 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { - coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); + coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, + work.pUserClipBuffer, + psContext.vI.center, + psContext.vJ.center); } - simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); - simdscalar depthPassMask = vCoverageMask; + simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); + simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; // Early-Z? if (T::bCanEarlyZ) { RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + depthPassMask = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + psContext.vZ, + pDepthBuffer, + vCoverageMask, + pStencilBuffer, + &stencilPassMask); + AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), + _simd_movemask_ps(stencilPassMask), + _simd_movemask_ps(vCoverageMask))); RDTSC_END(BEEarlyDepthTest, 0); // early-exit if no pixels passed depth or earlyZ is forced on if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthBuffer, + depthPassMask, + vCoverageMask, + pStencilBuffer, + stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) { @@ -144,7 +179,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 } psContext.sampleIndex = 0; - psContext.activeMask = _simd_castps_si(vCoverageMask); + psContext.activeMask = _simd_castps_si(vCoverageMask); // execute pixel shader RDTSC_BEGIN(BEPixelShader, pDC->drawId); @@ -161,50 +196,94 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if (!T::bCanEarlyZ) { RDTSC_BEGIN(BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, 
work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + depthPassMask = DepthStencilTest(&state, + work.triFlags.frontFacing, + work.triFlags.viewportIndex, + psContext.vZ, + pDepthBuffer, + vCoverageMask, + pStencilBuffer, + &stencilPassMask); + AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), + _simd_movemask_ps(stencilPassMask), + _simd_movemask_ps(vCoverageMask))); RDTSC_END(BELateDepthTest, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthBuffer, + depthPassMask, + vCoverageMask, + pStencilBuffer, + stencilPassMask); goto Endtile; } - } else { + } + else + { // for early z, consolidate discards from shader // into depthPassMask depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); } - uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); // output merger RDTSC_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND - OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId); + OutputMerger8x2(pDC, + psContext, + psContext.pColorBuffer, + 0, + &state.blendState, + state.pfnBlendFunc, + vCoverageMask, + depthPassMask, + state.psState.renderTargetMask, + useAlternateOffset, + workerId); #else - OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId, workerId); + OutputMerger4x2(pDC, + psContext, + psContext.pColorBuffer, + 0, + &state.blendState, + state.pfnBlendFunc, + vCoverageMask, + depthPassMask, + state.psState.renderTargetMask, + workerId, + workerId); #endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], + &state.depthStencilState, + work.triFlags.frontFacing, + psContext.vZ, + pDepthBuffer, + depthPassMask, + vCoverageMask, + pStencilBuffer, + stencilPassMask); } RDTSC_END(BEOutputMerger, 0); } -Endtile: + Endtile: RDTSC_BEGIN(BEEndTile, pDC->drawId); work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) { work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } @@ -212,12 +291,13 @@ Endtile: #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - DWORD rt; + DWORD rt; uint32_t rtMask = state.colorHottileEnable; - while(_BitScanForward(&rt, rtMask)) + while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 
<< rt); - psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } } #else @@ -226,19 +306,21 @@ Endtile: while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + psContext.pColorBuffer[rt] += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } #endif pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBuffer += + (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; RDTSC_END(BEEndTile, 0); - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); } - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); } @@ -253,9 +335,11 @@ struct BEChooserSingleSample // Last Arg Terminator static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) { - switch(tArg) + switch (tArg) { - case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break; + case SWR_BACKEND_SINGLE_SAMPLE: + return BackendSingleSample<SwrBackendTraits<ArgsT...>>; + break; case SWR_BACKEND_MSAA_PIXEL_RATE: case SWR_BACKEND_MSAA_SAMPLE_RATE: default: @@ -269,15 +353,25 @@ struct BEChooserSingleSample template <typename... TArgsT> static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) { - switch(tArg) + switch (tArg) { - case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_NONE: + return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( + remainingArgs...); + break; + case SWR_INPUT_COVERAGE_NORMAL: + return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc( + remainingArgs...); + break; + case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: + return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc( + remainingArgs...); + break; default: - SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); - break; + SWR_ASSERT(0 && "Invalid sample pattern\n"); + return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( + remainingArgs...); + break; } } @@ -285,17 +379,27 @@ struct BEChooserSingleSample template <typename... TArgsT> static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... 
remainingArgs) { - switch(tArg) + switch (tArg) { - case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_1X: + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_2X: + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_4X: + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_8X: + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); + break; + case SWR_MULTISAMPLE_16X: + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); + break; default: - SWR_ASSERT(0 && "Invalid sample count\n"); - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); - break; + SWR_ASSERT(0 && "Invalid sample count\n"); + return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); + break; } } @@ -303,7 +407,7 @@ struct BEChooserSingleSample template <typename... TArgsT> static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) { - if(tArg == true) + if (tArg == true) { return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...); } @@ -314,15 +418,20 @@ struct BEChooserSingleSample void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) { - for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) + for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) { - for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) + for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { - for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[inputCoverage][isCentroid][canEarlyZ] = - BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage, - (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); + BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, + false, + (SWR_INPUT_COVERAGE)inputCoverage, + (isCentroid > 0), + false, + (canEarlyZ > 0), + SWR_BACKEND_SINGLE_SAMPLE); } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 7b9c20ef802..6d9680b72c5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file binner.cpp -* -* @brief Implementation for the macrotile binner -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file binner.cpp + * + * @brief Implementation for the macrotile binner + * + ******************************************************************************/ #include "binner.h" #include "context.h" @@ -37,27 +37,25 @@ // Function Prototype template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupLinesImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - Float<SIMD_T> recipW[], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx); +void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[], + Float<SIMD_T> recipW[], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx); template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupPointsImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx); +void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx); ////////////////////////////////////////////////////////////////////////// /// @brief Processes attributes for the backend based on linkage mask and @@ -68,26 +66,23 @@ void BinPostSetupPointsImpl( /// @param pLinkageMap - maps VS attribute slot to PS slot /// @param triIndex - Triangle to process attributes for /// @param pBuffer - Output result -template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate> +template <typename NumVertsT, + typename IsSwizzledT, + typename HasConstantInterpT, + typename IsDegenerate> INLINE void ProcessAttributes( - DRAW_CONTEXT *pDC, - PA_STATE&pa, - uint32_t triIndex, - uint32_t primId, - float *pBuffer) + DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer) { static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; // Conservative Rasterization requires degenerate tris to have constant attribute interpolation - uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask; + uint32_t constantInterpMask = + IsDegenerate::value ? 
0xFFFFFFFF : backendState.constantInterpolationMask; const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; - const PRIMITIVE_TOPOLOGY topo = pa.binTopology; + const PRIMITIVE_TOPOLOGY topo = pa.binTopology; static const float constTable[3][4] = { - { 0.0f, 0.0f, 0.0f, 0.0f }, - { 0.0f, 0.0f, 0.0f, 1.0f }, - { 1.0f, 1.0f, 1.0f, 1.0f } - }; + {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}}; for (uint32_t i = 0; i < backendState.numAttributes; ++i) { @@ -96,46 +91,45 @@ INLINE void ProcessAttributes( { SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib; - } else { inputSlot = backendState.vertexAttribOffset + i; } - simd4scalar attrib[3]; // triangle attribs (always 4 wide) - float* pAttribStart = pBuffer; + simd4scalar attrib[3]; // triangle attribs (always 4 wide) + float* pAttribStart = pBuffer; if (HasConstantInterpT::value || IsDegenerate::value) { if (CheckBit(constantInterpMask, i)) { - uint32_t vid; - uint32_t adjustedTriIndex; - static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 }; - static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } }; - static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } }; - static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } }; - static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } }; - - switch (topo) { + uint32_t vid; + uint32_t adjustedTriIndex; + static const uint32_t tristripProvokingVertex[] = {0, 2, 1}; + static const int32_t quadProvokingTri[2][4] = {{0, 0, 0, 1}, {0, -1, 0, 0}}; + static const uint32_t quadProvokingVertex[2][4] = {{0, 1, 2, 2}, {0, 1, 1, 2}}; + static const int32_t qstripProvokingTri[2][4] = {{0, 0, 0, 1}, {-1, 0, 0, 0}}; + static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}}; + + switch (topo) + { case TOP_QUAD_LIST: adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; - vid = quadProvokingVertex[triIndex & 1][provokingVertex]; + vid = quadProvokingVertex[triIndex & 1][provokingVertex]; break; case TOP_QUAD_STRIP: adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; - vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; + vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; break; case TOP_TRIANGLE_STRIP: adjustedTriIndex = triIndex; - vid = (triIndex & 1) - ? tristripProvokingVertex[provokingVertex] - : provokingVertex; + vid = + (triIndex & 1) ? 
tristripProvokingVertex[provokingVertex] : provokingVertex; break; default: adjustedTriIndex = triIndex; - vid = provokingVertex; + vid = provokingVertex; break; } @@ -214,7 +208,7 @@ INLINE void ProcessAttributes( } } -typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); +typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); struct ProcessAttributesChooser { @@ -227,9 +221,13 @@ struct ProcessAttributesChooser } }; -PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false) +PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, + bool IsSwizzled, + bool HasConstantInterp, + bool IsDegenerate = false) { - return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); + return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc( + IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); } ////////////////////////////////////////////////////////////////////////// @@ -240,18 +238,22 @@ PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzl /// @param primIndex - primitive index to process /// @param clipDistMask - mask of enabled clip distances /// @param pUserClipBuffer - buffer to store results -template<uint32_t NumVerts> -void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer) +template <uint32_t NumVerts> +void ProcessUserClipDist(const SWR_BACKEND_STATE& state, + PA_STATE& pa, + uint32_t primIndex, + float* pRecipW, + float* pUserClipBuffer) { - DWORD clipDist; + DWORD clipDist; uint32_t clipDistMask = state.clipDistanceMask; while (_BitScanForward(&clipDist, clipDistMask)) { clipDistMask &= ~(1 << clipDist); uint32_t clipSlot = clipDist >> 2; uint32_t clipComp = clipDist & 0x3; - uint32_t clipAttribSlot = clipSlot == 0 ? - state.vertexClipCullOffset : state.vertexClipCullOffset + 1; + uint32_t clipAttribSlot = + clipSlot == 0 ? 
state.vertexClipCullOffset : state.vertexClipCullOffset + 1; simd4scalar primClipDist[3]; pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); @@ -281,30 +283,35 @@ void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t } INLINE -void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2) +void TransposeVertices(simd4scalar (&dst)[8], + const simdscalar& src0, + const simdscalar& src1, + const simdscalar& src2) { vTranspose3x8(dst, src0, src1, src2); } INLINE -void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2) +void TransposeVertices(simd4scalar (&dst)[16], + const simd16scalar& src0, + const simd16scalar& src1, + const simd16scalar& src2) { - vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps()); + vTranspose4x16( + reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps()); } - #if KNOB_ENABLE_EARLY_RAST #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT) #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT) - -template<typename SIMD_T> +template <typename SIMD_T> struct EarlyRastHelper { }; -template<> +template <> struct EarlyRastHelper<SIMD256> { static SIMD256::Integer InitShiftCntrl() @@ -314,7 +321,7 @@ struct EarlyRastHelper<SIMD256> }; #if USE_SIMD16_FRONTEND -template<> +template <> struct EarlyRastHelper<SIMD512> { static SIMD512::Integer InitShiftCntrl() @@ -340,21 +347,22 @@ struct EarlyRastHelper<SIMD512> /// @param oneTileMask - defines triangles for ER to work on /// (tris that fit into ER tile) template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT> -uint32_t SIMDCALL EarlyRasterizer( - SIMDBBOX_T<SIMD_T> &er_bbox, - Integer<SIMD_T> (&vAi)[3], - Integer<SIMD_T> (&vBi)[3], - Integer<SIMD_T> (&vXi)[3], - Integer<SIMD_T> (&vYi)[3], - uint32_t cwTrisMask, - uint32_t triMask, - uint32_t oneTileMask) +uint32_t SIMDCALL EarlyRasterizer(SIMDBBOX_T<SIMD_T>& er_bbox, + Integer<SIMD_T> (&vAi)[3], + Integer<SIMD_T> (&vBi)[3], + Integer<SIMD_T> (&vXi)[3], + Integer<SIMD_T> (&vYi)[3], + uint32_t cwTrisMask, + uint32_t triMask, + uint32_t oneTileMask) { // step to pixel center of top-left pixel of the triangle bbox - Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin); + Integer<SIMD_T> vTopLeftX = + SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin); vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); - Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin); + Integer<SIMD_T> vTopLeftY = + SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin); vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); // negate A and B for CW tris @@ -367,16 +375,22 @@ uint32_t SIMDCALL EarlyRasterizer( RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0); - Integer<SIMD_T> vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl(); - Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask); - Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl); - - vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask))); - vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask))); - 
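// Sketch of the selection happening here (not part of this patch): sllv_epi32
// presumably shifts each triangle's bit of cwTrisMask into that lane's sign
// bit (InitShiftCntrl, whose body is elided above, would supply the per-lane
// shift counts), and blendv_ps keys off the sign bit, so clockwise lanes pick
// up the negated A/B coefficients and the edge equations report "inside" with
// a consistent sign for either winding.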
vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask))); - vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask))); - vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask))); - vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask))); + Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl(); + Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask); + Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl); + + vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask))); + vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask))); + vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask))); + vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask))); + vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask))); + vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask))); // evaluate edge equations at top-left pixel Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]); @@ -409,9 +423,12 @@ uint32_t SIMDCALL EarlyRasterizer( Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1)); // vA < 0 - vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0]))); - vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1]))); - vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2]))); + vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0]))); + vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1]))); + vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2]))); // vA == 0 && vB < 0 Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si()); @@ -422,75 +439,77 @@ uint32_t SIMDCALL EarlyRasterizer( vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]); vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]); - vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0))); - vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1))); - vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2))); - + vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0))); + vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1))); + vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps( 
+ SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2))); #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4 // Go down // coverage pixel 0 Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1); - vMask0 = SIMD_T::and_si(vMask0, vEdge2); + vMask0 = SIMD_T::and_si(vMask0, vEdge2); // coverage pixel 1 Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]); Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]); Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]); - Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask1 = SIMD_T::and_si(vMask1, vEdge2N); + Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask1 = SIMD_T::and_si(vMask1, vEdge2N); // coverage pixel 2 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); + vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); + vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); + vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask2 = SIMD_T::and_si(vMask2, vEdge2N); + vMask2 = SIMD_T::and_si(vMask2, vEdge2N); // coverage pixel 3 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); + vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); + vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); + vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask3 = SIMD_T::and_si(vMask3, vEdge2N); + vMask3 = SIMD_T::and_si(vMask3, vEdge2N); // One step to the right and then up // coverage pixel 4 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); + vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); + vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); + vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask4 = SIMD_T::and_si(vMask4, vEdge2N); + vMask4 = SIMD_T::and_si(vMask4, vEdge2N); // coverage pixel 5 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); + vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); + vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); + vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask5 = SIMD_T::and_si(vMask5, vEdge2N); + vMask5 = SIMD_T::and_si(vMask5, vEdge2N); // coverage pixel 6 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); + vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); + vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); + vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask6 = SIMD_T::and_si(vMask6, vEdge2N); + vMask6 = SIMD_T::and_si(vMask6, vEdge2N); // coverage pixel 7 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); + vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); + vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); + vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask7 = SIMD_T::and_si(vMask7, vEdge2N); + vMask7 = SIMD_T::and_si(vMask7, vEdge2N); Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1); 
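// What this OR-fold computes (a reader's note, not part of this patch): vLit1
// accumulates the coverage masks of the first eight pixels of the 4x4
// early-rast tile; a lane's sign bit ends up set iff that triangle covers at
// least one of them. vLit2 below does the same for the remaining eight, and
// movemask_ps(castsi_ps(vLit)) finally yields the per-triangle maskLit.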
- vLit1 = SIMD_T::or_si(vLit1, vMask2); - vLit1 = SIMD_T::or_si(vLit1, vMask3); - vLit1 = SIMD_T::or_si(vLit1, vMask4); - vLit1 = SIMD_T::or_si(vLit1, vMask5); - vLit1 = SIMD_T::or_si(vLit1, vMask6); - vLit1 = SIMD_T::or_si(vLit1, vMask7); + vLit1 = SIMD_T::or_si(vLit1, vMask2); + vLit1 = SIMD_T::or_si(vLit1, vMask3); + vLit1 = SIMD_T::or_si(vLit1, vMask4); + vLit1 = SIMD_T::or_si(vLit1, vMask5); + vLit1 = SIMD_T::or_si(vLit1, vMask6); + vLit1 = SIMD_T::or_si(vLit1, vMask7); // Step to the right and go down again @@ -498,29 +517,29 @@ uint32_t SIMDCALL EarlyRasterizer( vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask0 = SIMD_T::and_si(vMask0, vEdge2N); + vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask0 = SIMD_T::and_si(vMask0, vEdge2N); // coverage pixel 1 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask1 = SIMD_T::and_si(vMask1, vEdge2N); + vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask1 = SIMD_T::and_si(vMask1, vEdge2N); // coverage pixel 2 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask2 = SIMD_T::and_si(vMask2, vEdge2N); + vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask2 = SIMD_T::and_si(vMask2, vEdge2N); // coverage pixel 3 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask3 = SIMD_T::and_si(vMask3, vEdge2N); + vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask3 = SIMD_T::and_si(vMask3, vEdge2N); // And for the last time - to the right and up @@ -528,37 +547,37 @@ uint32_t SIMDCALL EarlyRasterizer( vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask4 = SIMD_T::and_si(vMask4, vEdge2N); + vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask4 = SIMD_T::and_si(vMask4, vEdge2N); // coverage pixel 5 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask5 = SIMD_T::and_si(vMask5, vEdge2N); + vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask5 = SIMD_T::and_si(vMask5, vEdge2N); // coverage pixel 6 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask6 = SIMD_T::and_si(vMask6, vEdge2N); + vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask6 = SIMD_T::and_si(vMask6, vEdge2N); // coverage pixel 7 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask7 = SIMD_T::and_si(vMask7, vEdge2N); + vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); + vMask7 = SIMD_T::and_si(vMask7, vEdge2N); Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1); - vLit2 = SIMD_T::or_si(vLit2, vMask2); - vLit2 = SIMD_T::or_si(vLit2, vMask3); - vLit2 = SIMD_T::or_si(vLit2, vMask4); - vLit2 = SIMD_T::or_si(vLit2, vMask5); - vLit2 = 
SIMD_T::or_si(vLit2, vMask6); - vLit2 = SIMD_T::or_si(vLit2, vMask7); + vLit2 = SIMD_T::or_si(vLit2, vMask2); + vLit2 = SIMD_T::or_si(vLit2, vMask3); + vLit2 = SIMD_T::or_si(vLit2, vMask4); + vLit2 = SIMD_T::or_si(vLit2, vMask5); + vLit2 = SIMD_T::or_si(vLit2, vMask6); + vLit2 = SIMD_T::or_si(vLit2, vMask7); Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2); @@ -612,7 +631,7 @@ uint32_t SIMDCALL EarlyRasterizer( #endif // Check which triangles have any pixel lit - uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit)); + uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit)); uint32_t maskUnlit = ~maskLit & oneTileMask; uint32_t oldTriMask = triMask; @@ -638,25 +657,24 @@ uint32_t SIMDCALL EarlyRasterizer( /// @param viewportIdx - viewport array index for each triangle. /// @tparam CT - ConservativeRastFETraits template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT> -void SIMDCALL BinTrianglesImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> tri[3], - uint32_t triMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx) +void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> tri[3], + uint32_t triMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { - const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx); + const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); RDTSC_BEGIN(FEBinTriangles, pDC->drawId); - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f); Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f); @@ -724,8 +742,10 @@ void SIMDCALL BinTrianglesImpl( calcDeterminantIntVertical(vAi, vBi, vDet); // cull zero area - uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si()))); - uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si()))); + uint32_t maskLo = + SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si()))); + uint32_t maskHi = + SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si()))); uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2)); @@ -744,13 +764,17 @@ void SIMDCALL BinTrianglesImpl( uint32_t frontWindingTris; if (rastState.frontWinding == SWR_FRONTWINDING_CW) { - maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); - maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); + maskLo = SIMD_T::movemask_pd( + SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); + maskHi = SIMD_T::movemask_pd(
+ maskHi = SIMD_T::movemask_pd( + SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1]))); } frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2)); @@ -758,12 +782,24 @@ void SIMDCALL BinTrianglesImpl( uint32_t cullTris; switch ((SWR_CULLMODE)rastState.cullMode) { - case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; - case SWR_CULLMODE_NONE: cullTris = 0x0; break; - case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; - // 0 area triangles are marked as backfacing, which is required behavior for conservative rast - case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; - default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; + case SWR_CULLMODE_BOTH: + cullTris = 0xffffffff; + break; + case SWR_CULLMODE_NONE: + cullTris = 0x0; + break; + case SWR_CULLMODE_FRONT: + cullTris = frontWindingTris; + break; + // 0 area triangles are marked as backfacing, which is required behavior for conservative + // rast + case SWR_CULLMODE_BACK: + cullTris = ~frontWindingTris; + break; + default: + SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); + cullTris = 0x0; + break; } triMask &= ~cullTris; @@ -777,12 +813,12 @@ void SIMDCALL BinTrianglesImpl( /// Note: these variable initializations must stay above any 'goto endBinTriangles' // compute per tri backface - uint32_t frontFaceMask = frontWindingTris; - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - DWORD triIndex = 0; + uint32_t frontFaceMask = frontWindingTris; + uint32_t* pPrimID = (uint32_t*)&primID; + const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; + DWORD triIndex = 0; - uint32_t edgeEnable; + uint32_t edgeEnable; PFN_WORK_FUNC pfnWork; if (CT::IsConservativeT::value) { @@ -794,13 +830,15 @@ void SIMDCALL BinTrianglesImpl( const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]); const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]); - uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask))); + uint32_t e0Mask = + SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask))); // e1 = v2-v1 const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]); const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]); - uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask))); + uint32_t e1Mask = + SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask))); // e2 = v0-v2 // if v0 == v1 & v1 == v2, v0 == v2 @@ -827,8 +865,12 @@ void SIMDCALL BinTrianglesImpl( else { // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false)); + pfnWork = GetRasterizerFunc(rastState.sampleCount, + rastState.bIsCenterPattern, + (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, + EdgeValToEdgeState(ALL_EDGES_VALID), + (state.scissorsTileAligned == false)); } SIMDBBOX_T<SIMD_T> bbox; @@ -854,20 +896,20 @@ void SIMDCALL BinTrianglesImpl( { Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127)); - xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255)); + xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255)); Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax,
SIMD_T::set1_epi32(128)); - xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255)); + xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255)); Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax); Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127)); - ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255)); + ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255)); Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128)); - ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255)); + ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255)); Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax); - vMaskV = SIMD_T::or_si(vMaskH, vMaskV); + vMaskV = SIMD_T::or_si(vMaskH, vMaskV); cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV)); } @@ -879,15 +921,20 @@ void SIMDCALL BinTrianglesImpl( } } - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is + // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { - GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], + pViewportIndex, + scisXmin, + scisYmin, + scisXmax, + scisYmax); } else // broadcast fast path for non-VPAI case. { @@ -909,23 +956,26 @@ void SIMDCALL BinTrianglesImpl( if (CT::IsConservativeT::value) { - // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has - // some area. Bump the xmax/ymax edges out + // in the case where a degenerate triangle is on a scissor edge, we need to make sure the + // primitive bbox has some area. 
Bump the xmax/ymax edges out Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax); - bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom); + bbox.ymax = SIMD_T::blendv_epi32( + bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom); Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax); - bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight); + bbox.xmax = SIMD_T::blendv_epi32( + bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight); } // Cull tris completely outside scissor { Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer<SIMD_T> maskOutsideScissorXY = + SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - triMask = triMask & ~maskOutsideScissor; + triMask = triMask & ~maskOutsideScissor; } #if KNOB_ENABLE_EARLY_RAST @@ -936,26 +986,34 @@ void SIMDCALL BinTrianglesImpl( // convert to ER tiles SIMDBBOX_T<SIMD_T> er_bbox; - er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin); - er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax); - er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin); - er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax); + er_bbox.xmin = + SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin); + er_bbox.xmax = + SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax); + er_bbox.ymin = + SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin); + er_bbox.ymax = + SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax); Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax); Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax); // Take only triangles that fit into ER tile - uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY))); + uint32_t oneTileMask = + triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY))); if (oneTileMask) { // determine CW tris (det > 0) - uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); - uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); + uint32_t maskCwLo = SIMD_T::movemask_pd( + SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); + uint32_t maskCwHi = SIMD_T::movemask_pd( + SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2)); // Try early rasterization - triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask); + triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>( + er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask); if (!triMask) { @@ -963,7 +1021,6 @@ void SIMDCALL BinTrianglesImpl( return; } } - } #endif @@ -975,29 +1032,32 @@ endBinTriangles: { 
// Simple non-conformant wireframe mode, useful for debugging // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD - Vec4<SIMD_T> line[2]; + Vec4<SIMD_T> line[2]; Float<SIMD_T> recipW[2]; - line[0] = tri[0]; - line[1] = tri[1]; + line[0] = tri[0]; + line[1] = tri[1]; recipW[0] = vRecipW0; recipW[1] = vRecipW1; - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); + BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); - line[0] = tri[1]; - line[1] = tri[2]; + line[0] = tri[1]; + line[1] = tri[2]; recipW[0] = vRecipW1; recipW[1] = vRecipW2; - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); + BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); - line[0] = tri[2]; - line[1] = tri[0]; + line[0] = tri[2]; + line[1] = tri[0]; recipW[0] = vRecipW2; recipW[1] = vRecipW0; - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); + BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); RDTSC_END(FEBinTriangles, 1); return; @@ -1005,9 +1065,12 @@ endBinTriangles: else if (rastState.fillMode == SWR_FILLMODE_POINT) { // Bin 3 points - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx); - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx); - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx); + BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx); + BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx); + BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( + pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx); RDTSC_END(FEBinTriangles, 1); return; @@ -1019,12 +1082,13 @@ endBinTriangles: bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) + aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); // transpose verts needed for backend /// @todo modify BE to take non-transformed verts @@ -1041,7 +1105,7 @@ endBinTriangles: // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { - uint32_t linkageCount = state.backendState.numAttributes; + uint32_t linkageCount = 
state.backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; @@ -1052,8 +1116,13 @@ endBinTriangles: { // only rasterize valid edges if we have a degenerate primitive int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false)); + work.pfnWork = + GetRasterizerFunc(rastState.sampleCount, + rastState.bIsCenterPattern, + (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, + EdgeValToEdgeState(triEdgeEnable), + (state.scissorsTileAligned == false)); // Degenerate triangles are required to be constant interpolated isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; @@ -1065,30 +1134,33 @@ endBinTriangles: } // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = + GetProcessAttributesFunc(3, + state.backendState.swizzleEnable, + state.backendState.constantInterpolationMask, + isDegenerate); - TRIANGLE_WORK_DESC &desc = work.desc.tri; + TRIANGLE_WORK_DESC& desc = work.desc.tri; desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - desc.triFlags.viewportIndex = pViewportIndex[triIndex]; + desc.triFlags.viewportIndex = pViewportIndex[triIndex]; auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs - float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.pAttribs = pAttribs; + float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.pAttribs = pAttribs; desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); // store triangle vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); // store user clip distances @@ -1096,7 +1168,8 @@ endBinTriangles: { uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); + ProcessUserClipDist<3>( + state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); } for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) @@ -1112,39 +1185,39 @@ endBinTriangles: } } - triMask &= ~(1 << triIndex); + triMask &= ~(1 << triIndex); } RDTSC_END(FEBinTriangles, 1); } template <typename CT> -void BinTriangles( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simdvector tri[3], - uint32_t triMask, - simdscalari const &primID, - simdscalari const &viewportIdx, - simdscalari const &rtIdx) +void 
BinTriangles(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector tri[3], + uint32_t triMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { - BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); + BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>( + pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); } #if USE_SIMD16_FRONTEND template <typename CT> -void SIMDCALL BinTriangles_simd16( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simd16vector tri[3], - uint32_t triMask, - simd16scalari const &primID, - simd16scalari const &viewportIdx, - simd16scalari const &rtIdx) +void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector tri[3], + uint32_t triMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { - BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); + BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>( + pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); } #endif @@ -1186,27 +1259,26 @@ PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) #endif template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupPointsImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx) +void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { RDTSC_BEGIN(FEBinPoints, pDC->drawId); - Vec4<SIMD_T> &primVerts = prim[0]; + Vec4<SIMD_T>& primVerts = prim[0]; - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc( + 1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); // convert to fixed point Integer<SIMD_T> vXi, vYi; @@ -1224,64 +1296,68 @@ void BinPostSetupPointsImpl( primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi)); primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi)); - // compute macro tile coordinates + // compute macro tile coordinates Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi); Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi); OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroX), macroX); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroY), macroY); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY); // compute raster tile coordinates - Integer<SIMD_T> rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi); - 
Integer<SIMD_T> rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi); + Integer<SIMD_T> rasterX = + SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi); + Integer<SIMD_T> rasterY = + SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi); // compute raster tile relative x,y for coverage mask Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX); Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY); - Integer<SIMD_T> tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX); - Integer<SIMD_T> tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY); + Integer<SIMD_T> tileRelativeX = + SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX); + Integer<SIMD_T> tileRelativeY = + SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY); OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH]; OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeX), tileRelativeX); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeY), tileRelativeY); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY); OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH]; OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedX), tileAlignedX); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedY), tileAlignedY); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY); OSALIGNSIMD16(float) aZ[SIMD_WIDTH]; - SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z); + SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z); // store render target array index - const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx); - - uint32_t *pPrimID = (uint32_t *)&primID; - DWORD primIndex = 0; + const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); + + uint32_t* pPrimID = (uint32_t*)&primID; + DWORD primIndex = 0; const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; // scan remaining valid triangles and bin each separately while (_BitScanForward(&primIndex, primMask)) { - uint32_t linkageCount = backendState.numAttributes; + uint32_t linkageCount = backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; - TRIANGLE_WORK_DESC &desc = work.desc.tri; + TRIANGLE_WORK_DESC& desc = work.desc.tri; // points are always front facing - desc.triFlags.frontFacing = 1; + desc.triFlags.frontFacing = 1; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeSimplePoint; @@ -1289,18 +1365,19 @@ void BinPostSetupPointsImpl( SWR_ASSERT(pArena != nullptr); // store attributes - float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); - desc.pAttribs = pAttribs; + float* pAttribs = + (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); + desc.pAttribs = pAttribs; desc.numAttribs = linkageCount; 
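// The while (_BitScanForward(&primIndex, primMask)) loop above visits each
// set bit of the primitive mask in turn. A portable scalar sketch of that
// idiom (hypothetical helper; __builtin_ctz is the GCC/Clang counterpart of
// MSVC's _BitScanForward):
#include <cstdint>

template <typename F>
static void ForEachSetBit(uint32_t mask, F&& fn)
{
    while (mask != 0)
    {
        const uint32_t index = __builtin_ctz(mask); // index of lowest set bit
        fn(index);                                  // e.g. bin this primitive
        mask &= mask - 1;                           // clear the lowest set bit
    }
}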
pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); // store raster tile aligned x, y, perspective correct z - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; + float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; - *pTriBuffer = aZ[primIndex]; + *pTriBuffer = aZ[primIndex]; uint32_t tX = aTileRelativeX[primIndex]; uint32_t tY = aTileRelativeY[primIndex]; @@ -1310,7 +1387,7 @@ void BinPostSetupPointsImpl( work.desc.tri.triFlags.coverageMask = tX | (tY << 4); // bin it - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_SETUP_TRIS) #endif @@ -1343,7 +1420,7 @@ void BinPostSetupPointsImpl( bbox.xmin = bbox.xmax = vXi; bbox.ymin = bbox.ymax = vYi; - Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); + Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth); bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); @@ -1351,15 +1428,20 @@ void BinPostSetupPointsImpl( bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi); bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi); - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge + // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { - GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], + pViewportIndex, + scisXmin, + scisYmin, + scisXmax, + scisYmax); } else // broadcast fast path for non-VPAI case. { @@ -1371,16 +1453,19 @@ void BinPostSetupPointsImpl( bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); - bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); - bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); + bbox.xmax = + SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); + bbox.ymax = + SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); } // Cull bloated points completely outside scissor Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer<SIMD_T> maskOutsideScissorXY = + SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; + primMask = primMask & ~maskOutsideScissor; // Convert bbox to macrotile units. 
bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin); @@ -1388,46 +1473,47 @@ void BinPostSetupPointsImpl( bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) + aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); // store render target array index - const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx); + const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH]; - SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize); + SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize); - uint32_t *pPrimID = (uint32_t *)&primID; + uint32_t* pPrimID = (uint32_t*)&primID; OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH]; OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH]; OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH]; - SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x); - SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y); - SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z); + SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x); + SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y); + SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z); // scan remaining valid prims and bin each separately const SWR_BACKEND_STATE& backendState = state.backendState; - DWORD primIndex; + DWORD primIndex; while (_BitScanForward(&primIndex, primMask)) { - uint32_t linkageCount = backendState.numAttributes; + uint32_t linkageCount = backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; - TRIANGLE_WORK_DESC &desc = work.desc.tri; + TRIANGLE_WORK_DESC& desc = work.desc.tri; - desc.triFlags.frontFacing = 1; - desc.triFlags.pointSize = aPointSize[primIndex]; + desc.triFlags.frontFacing = 1; + desc.triFlags.pointSize = aPointSize[primIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeTriPoint; @@ -1440,11 +1526,11 @@ void BinPostSetupPointsImpl( pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); // store point vertex data - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *pTriBuffer++ = aPrimVertsX[primIndex]; - *pTriBuffer++ = aPrimVertsY[primIndex]; - *pTriBuffer = aPrimVertsZ[primIndex]; + float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *pTriBuffer++ = 
aPrimVertsX[primIndex]; + *pTriBuffer++ = aPrimVertsY[primIndex]; + *pTriBuffer = aPrimVertsZ[primIndex]; // store user clip distances if (backendState.clipDistanceMask) @@ -1454,14 +1540,15 @@ void BinPostSetupPointsImpl( float dists[8]; float one = 1.0f; ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists); - for (uint32_t i = 0; i < numClipDist; i++) { + for (uint32_t i = 0; i < numClipDist; i++) + { desc.pUserClipBuffer[3 * i + 0] = 0.0f; desc.pUserClipBuffer[3 * i + 1] = 0.0f; desc.pUserClipBuffer[3 * i + 2] = dists[i]; } } - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) { for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) @@ -1490,19 +1577,18 @@ void BinPostSetupPointsImpl( /// @param tri - Contains point position data for SIMDs worth of points. /// @param primID - Primitive ID for each point. template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPointsImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[3], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx) +void BinPointsImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[3], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { - const API_STATE& state = GetApiState(pDC); - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_RASTSTATE& rastState = state.rastState; + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_RASTSTATE& rastState = state.rastState; if (!feState.vpTransformDisable) { @@ -1530,57 +1616,34 @@ void BinPointsImpl( prim[0].y = SIMD_T::add_ps(prim[0].y, offset); BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( - pDC, - pa, - workerId, - prim, - primMask, - primID, - viewportIdx, - rtIdx); + pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); } -void BinPoints( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simdvector prim[3], - uint32_t primMask, - simdscalari const &primID, - simdscalari const &viewportIdx, - simdscalari const &rtIdx) +void BinPoints(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>( - pDC, - pa, - workerId, - prim, - primMask, - primID, - viewportIdx, - rtIdx); + pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); } #if USE_SIMD16_FRONTEND -void SIMDCALL BinPoints_simd16( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simd16vector prim[3], - uint32_t primMask, - simd16scalari const &primID, - simd16scalari const &viewportIdx, - simd16scalari const & rtIdx) +void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prim[3], + uint32_t primMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>( - pDC, - pa, - workerId, - prim, - primMask, - primID, - viewportIdx, - rtIdx); + pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); } #endif @@ -1593,30 +1656,29 @@ void SIMDCALL BinPoints_simd16( /// @param primID - Primitive ID for each line. /// @param viewportIdx - Viewport Array Index for each line. 
template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupLinesImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - Float<SIMD_T> recipW[], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const &rtIdx) +void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[], + Float<SIMD_T> recipW[], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { - const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx); + const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); RDTSC_BEGIN(FEBinLines, pDC->drawId); - const API_STATE &state = GetApiState(pDC); - const SWR_RASTSTATE &rastState = state.rastState; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc( + 2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - Float<SIMD_T> &vRecipW0 = recipW[0]; - Float<SIMD_T> &vRecipW1 = recipW[1]; + Float<SIMD_T>& vRecipW0 = recipW[0]; + Float<SIMD_T>& vRecipW1 = recipW[1]; // convert to fixed point Integer<SIMD_T> vXi[2], vYi[2]; @@ -1627,19 +1689,20 @@ void BinPostSetupLinesImpl( vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y); // compute x-major vs y-major mask - Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); - Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); - Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); - uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask); + Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); + Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); + Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask); // cull zero-length lines Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si()); - vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si())); + vZeroLengthMask = + SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si())); primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask)); - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + uint32_t* pPrimID = (uint32_t*)&primID; + const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; // Calc bounding box of lines SIMDBBOX_T<SIMD_T> bbox; @@ -1649,7 +1712,7 @@ void BinPostSetupLinesImpl( bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]); // bloat bbox by line width along minor axis - Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); + Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth); SIMDBBOX_T<SIMD_T> bloatBox; @@ -1664,13 +1727,19 @@ void BinPostSetupLinesImpl( bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); - // Intersect with scissor/viewport. 
Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is + // exclusive. { Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { - GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], + pViewportIndex, + scisXmin, + scisYmin, + scisXmax, + scisYmax); } else // broadcast fast path for non-VPAI case. { @@ -1682,17 +1751,20 @@ void BinPostSetupLinesImpl( bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); - bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); - bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); + bbox.xmax = + SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); + bbox.ymax = + SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); } // Cull prims completely outside scissor { Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer<SIMD_T> maskOutsideScissorXY = + SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; + primMask = primMask & ~maskOutsideScissor; } // transpose verts needed for backend @@ -1713,34 +1785,35 @@ void BinPostSetupLinesImpl( bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) + aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps()); TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps()); TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps()); - TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps()); + TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps()); // scan remaining valid prims and bin each separately DWORD primIndex; while (_BitScanForward(&primIndex, primMask)) { - uint32_t linkageCount = state.backendState.numAttributes; + uint32_t linkageCount = state.backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; - TRIANGLE_WORK_DESC &desc = work.desc.tri; + TRIANGLE_WORK_DESC& desc = work.desc.tri; - 
desc.triFlags.frontFacing = 1; - desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; + desc.triFlags.frontFacing = 1; + desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeLine; @@ -1748,16 +1821,16 @@ void BinPostSetupLinesImpl( SWR_ASSERT(pArena != nullptr); // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); // store line vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); + _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); // store user clip distances @@ -1765,10 +1838,11 @@ void BinPostSetupLinesImpl( { uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); - ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); + ProcessUserClipDist<2>( + state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); } - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) { for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) @@ -1799,21 +1873,20 @@ endBinLines: /// @param primID - Primitive ID for each line. /// @param viewportIdx - Viewport Array Index for each line. 
template <typename SIMD_T, uint32_t SIMD_WIDTH> -void SIMDCALL BinLinesImpl( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - Vec4<SIMD_T> prim[3], - uint32_t primMask, - Integer<SIMD_T> const &primID, - Integer<SIMD_T> const &viewportIdx, - Integer<SIMD_T> const & rtIdx) +void SIMDCALL BinLinesImpl(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + Vec4<SIMD_T> prim[3], + uint32_t primMask, + Integer<SIMD_T> const& primID, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; - Float<SIMD_T> vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; + Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)}; if (!feState.vpTransformDisable) { @@ -1851,42 +1924,34 @@ void SIMDCALL BinLinesImpl( prim[1].y = SIMD_T::add_ps(prim[1].y, offset); BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( - pDC, - pa, - workerId, - prim, - vRecipW, - primMask, - primID, - viewportIdx, - rtIdx); + pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx); } -void BinLines( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simdvector prim[], - uint32_t primMask, - simdscalari const &primID, - simdscalari const &viewportIdx, - simdscalari const &rtIdx) +void BinLines(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[], + uint32_t primMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { - BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); + BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>( + pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); } #if USE_SIMD16_FRONTEND -void SIMDCALL BinLines_simd16( - DRAW_CONTEXT *pDC, - PA_STATE &pa, - uint32_t workerId, - simd16vector prim[3], - uint32_t primMask, - simd16scalari const &primID, - simd16scalari const &viewportIdx, - simd16scalari const &rtIdx) +void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prim[3], + uint32_t primMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { - BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); + BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>( + pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h index 443dac57fef..f5f6d8074cb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.h +++ b/src/gallium/drivers/swr/rasterizer/core/binner.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file binner.h -* -* @brief Declaration for the macrotile binner -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file binner.h + * + * @brief Declaration for the macrotile binner + * + ******************************************************************************/ #include "state.h" #include "conservativeRast.h" #include "utils.h" @@ -47,22 +47,23 @@ public: }; ////////////////////////////////////////////////////////////////////////// -/// @brief Convert the X,Y coords of a triangle to the requested Fixed +/// @brief Convert the X,Y coords of a triangle to the requested Fixed /// Point precision from FP32. template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>> -INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T> &vIn) +INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn) { return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value))); } ////////////////////////////////////////////////////////////////////////// -/// @brief Helper function to set the X,Y coords of a triangle to the +/// @brief Helper function to set the X,Y coords of a triangle to the /// requested Fixed Point precision from FP32. 
/// @param tri: simdvector[3] of FP triangle verts /// @param vXi: fixed point X coords of tri verts /// @param vYi: fixed point Y coords of tri verts template <typename SIMD_T> -INLINE static void FPToFixedPoint(const Vec4<SIMD_T> *const tri, Integer<SIMD_T>(&vXi)[3], Integer<SIMD_T>(&vYi)[3]) +INLINE static void +FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3]) { vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x); vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y); @@ -78,10 +79,12 @@ INLINE static void FPToFixedPoint(const Vec4<SIMD_T> *const tri, Integer<SIMD_T> /// @param vX: fixed point X position for triangle verts /// @param vY: fixed point Y position for triangle verts /// @param bbox: fixed point bbox -/// *Note*: expects vX, vY to be in the correct precision for the type +/// *Note*: expects vX, vY to be in the correct precision for the type /// of rasterization. This avoids unnecessary FP->fixed conversions. template <typename SIMD_T, typename CT> -INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T>(&vX)[3], const Integer<SIMD_T>(&vY)[3], SIMDBBOX_T<SIMD_T> &bbox) +INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3], + const Integer<SIMD_T> (&vY)[3], + SIMDBBOX_T<SIMD_T>& bbox) { Integer<SIMD_T> vMinX = vX[0]; @@ -105,8 +108,9 @@ INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T>(&vX)[3], const Inte if (CT::BoundingBoxOffsetT::value != 0) { - /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization - /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. + /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative + /// rasterization expand bbox by 1/256; coverage will be correctly handled in the + /// rasterizer. const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value); @@ -132,119 +136,119 @@ INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T>(&vX)[3], const Inte /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. // /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
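// fpToFixedPointVertical above multiplies by the format's scale and converts
// with cvtps_epi32 (round to nearest). A scalar sketch for the default 16.8
// format, whose scale is 2^8 = 256 (hypothetical helper, not SWR code):
#include <cmath>
#include <cstdint>

static int32_t ToFixed_16_8(float v)
{
    // std::lrintf rounds to nearest in the default FP environment,
    // matching cvtps_epi32's behavior.
    return static_cast<int32_t>(std::lrintf(v * 256.0f));
}
// One x.8 ULP (an integer 1 here) is what the binners subtract from
// xmax/ymax to make those edges exclusive before clamping to the scissor.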
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, - simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) +static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint, + const uint32_t* pViewportIndex, + simdscalari& scisXmin, + simdscalari& scisYmin, + simdscalari& scisXmax, + simdscalari& scisYmax) { - scisXmin = _simd_set_epi32( - pScissorsInFixedPoint[pViewportIndex[7]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[0]].xmin); - scisYmin = _simd_set_epi32( - pScissorsInFixedPoint[pViewportIndex[7]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[0]].ymin); - scisXmax = _simd_set_epi32( - pScissorsInFixedPoint[pViewportIndex[7]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[0]].xmax); - scisYmax = _simd_set_epi32( - pScissorsInFixedPoint[pViewportIndex[7]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[0]].ymax); + scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[0]].xmin); + scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[0]].ymin); + scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[0]].xmax); + scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + 
pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[0]].ymax); } -static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, - simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax) +static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint, + const uint32_t* pViewportIndex, + simd16scalari& scisXmin, + simd16scalari& scisYmin, + simd16scalari& scisXmax, + simd16scalari& scisYmax) { - scisXmin = _simd16_set_epi32( - pScissorsInFixedPoint[pViewportIndex[15]].xmin, - pScissorsInFixedPoint[pViewportIndex[14]].xmin, - pScissorsInFixedPoint[pViewportIndex[13]].xmin, - pScissorsInFixedPoint[pViewportIndex[12]].xmin, - pScissorsInFixedPoint[pViewportIndex[11]].xmin, - pScissorsInFixedPoint[pViewportIndex[10]].xmin, - pScissorsInFixedPoint[pViewportIndex[9]].xmin, - pScissorsInFixedPoint[pViewportIndex[8]].xmin, - pScissorsInFixedPoint[pViewportIndex[7]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[0]].xmin); - - scisYmin = _simd16_set_epi32( - pScissorsInFixedPoint[pViewportIndex[15]].ymin, - pScissorsInFixedPoint[pViewportIndex[14]].ymin, - pScissorsInFixedPoint[pViewportIndex[13]].ymin, - pScissorsInFixedPoint[pViewportIndex[12]].ymin, - pScissorsInFixedPoint[pViewportIndex[11]].ymin, - pScissorsInFixedPoint[pViewportIndex[10]].ymin, - pScissorsInFixedPoint[pViewportIndex[9]].ymin, - pScissorsInFixedPoint[pViewportIndex[8]].ymin, - pScissorsInFixedPoint[pViewportIndex[7]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[0]].ymin); - - scisXmax = _simd16_set_epi32( - pScissorsInFixedPoint[pViewportIndex[15]].xmax, - pScissorsInFixedPoint[pViewportIndex[14]].xmax, - pScissorsInFixedPoint[pViewportIndex[13]].xmax, - pScissorsInFixedPoint[pViewportIndex[12]].xmax, - pScissorsInFixedPoint[pViewportIndex[11]].xmax, - pScissorsInFixedPoint[pViewportIndex[10]].xmax, - pScissorsInFixedPoint[pViewportIndex[9]].xmax, - pScissorsInFixedPoint[pViewportIndex[8]].xmax, - pScissorsInFixedPoint[pViewportIndex[7]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[0]].xmax); - - scisYmax = _simd16_set_epi32( - pScissorsInFixedPoint[pViewportIndex[15]].ymax, - pScissorsInFixedPoint[pViewportIndex[14]].ymax, - pScissorsInFixedPoint[pViewportIndex[13]].ymax, - pScissorsInFixedPoint[pViewportIndex[12]].ymax, - pScissorsInFixedPoint[pViewportIndex[11]].ymax, - pScissorsInFixedPoint[pViewportIndex[10]].ymax, - pScissorsInFixedPoint[pViewportIndex[9]].ymax, - 
pScissorsInFixedPoint[pViewportIndex[8]].ymax, - pScissorsInFixedPoint[pViewportIndex[7]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[0]].ymax); + scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin, + pScissorsInFixedPoint[pViewportIndex[14]].xmin, + pScissorsInFixedPoint[pViewportIndex[13]].xmin, + pScissorsInFixedPoint[pViewportIndex[12]].xmin, + pScissorsInFixedPoint[pViewportIndex[11]].xmin, + pScissorsInFixedPoint[pViewportIndex[10]].xmin, + pScissorsInFixedPoint[pViewportIndex[9]].xmin, + pScissorsInFixedPoint[pViewportIndex[8]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[0]].xmin); + + scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin, + pScissorsInFixedPoint[pViewportIndex[14]].ymin, + pScissorsInFixedPoint[pViewportIndex[13]].ymin, + pScissorsInFixedPoint[pViewportIndex[12]].ymin, + pScissorsInFixedPoint[pViewportIndex[11]].ymin, + pScissorsInFixedPoint[pViewportIndex[10]].ymin, + pScissorsInFixedPoint[pViewportIndex[9]].ymin, + pScissorsInFixedPoint[pViewportIndex[8]].ymin, + pScissorsInFixedPoint[pViewportIndex[7]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[0]].ymin); + + scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax, + pScissorsInFixedPoint[pViewportIndex[14]].xmax, + pScissorsInFixedPoint[pViewportIndex[13]].xmax, + pScissorsInFixedPoint[pViewportIndex[12]].xmax, + pScissorsInFixedPoint[pViewportIndex[11]].xmax, + pScissorsInFixedPoint[pViewportIndex[10]].xmax, + pScissorsInFixedPoint[pViewportIndex[9]].xmax, + pScissorsInFixedPoint[pViewportIndex[8]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[0]].xmax); + + scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax, + pScissorsInFixedPoint[pViewportIndex[14]].ymax, + pScissorsInFixedPoint[pViewportIndex[13]].ymax, + pScissorsInFixedPoint[pViewportIndex[12]].ymax, + pScissorsInFixedPoint[pViewportIndex[11]].ymax, + pScissorsInFixedPoint[pViewportIndex[10]].ymax, + pScissorsInFixedPoint[pViewportIndex[9]].ymax, + pScissorsInFixedPoint[pViewportIndex[8]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + 
pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[0]].ymax); }
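For reference, the gather above is just a vectorized per-lane table lookup. Below is a minimal scalar sketch of the same operation (the Rect type and function name are illustrative stand-ins, not driver code); note that _simd16_set_epi32 takes its arguments from lane 15 down to lane 0, which is why the vector version lists the viewport indices in descending order.

#include <cstdint>

struct Rect { int32_t xmin, ymin, xmax, ymax; }; // illustrative stand-in for SWR_RECT

// Scalar equivalent of GatherScissors: lane i receives the scissor rect
// selected by that lane's viewport index.
static void GatherScissorsScalar(const Rect*     pScissorsInFixedPoint,
                                 const uint32_t* pViewportIndex,
                                 int32_t xmin[16], int32_t ymin[16],
                                 int32_t xmax[16], int32_t ymax[16])
{
    for (int lane = 0; lane < 16; ++lane)
    {
        const Rect& r = pScissorsInFixedPoint[pViewportIndex[lane]];
        xmin[lane] = r.xmin;
        ymin[lane] = r.ymin;
        xmax[lane] = r.xmax;
        ymax[lane] = r.ymax;
    }
}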
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h index c89c47646a3..7b2f77985f8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/blend.h +++ b/src/gallium/drivers/swr/rasterizer/core/blend.h @@ -1,77 +1,82 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file blend.cpp -* -* @brief Implementation for blending operations. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file blend.cpp + * + * @brief Implementation for blending operations. 
+ * + ******************************************************************************/ #include "state.h" -template<bool Color, bool Alpha> -INLINE -void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out) +template <bool Color, bool Alpha> +INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func, + simdvector& constantColor, + simdvector& src, + simdvector& src1, + simdvector& dst, + simdvector& out) { simdvector result; switch (func) { - case BLENDFACTOR_ZERO: + case BLENDFACTOR_ZERO: result.x = _simd_setzero_ps(); result.y = _simd_setzero_ps(); result.z = _simd_setzero_ps(); result.w = _simd_setzero_ps(); break; - case BLENDFACTOR_ONE: + case BLENDFACTOR_ONE: result.x = _simd_set1_ps(1.0); result.y = _simd_set1_ps(1.0); result.z = _simd_set1_ps(1.0); result.w = _simd_set1_ps(1.0); break; - case BLENDFACTOR_SRC_COLOR: + case BLENDFACTOR_SRC_COLOR: result = src; break; - case BLENDFACTOR_DST_COLOR: + case BLENDFACTOR_DST_COLOR: result = dst; break; - case BLENDFACTOR_INV_SRC_COLOR: + case BLENDFACTOR_INV_SRC_COLOR: result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); break; - case BLENDFACTOR_INV_DST_COLOR: + case BLENDFACTOR_INV_DST_COLOR: result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); break; - case BLENDFACTOR_SRC_ALPHA: result.x = src.w; + case BLENDFACTOR_SRC_ALPHA: + result.x = src.w; result.y = src.w; result.z = src.w; result.w = src.w; @@ -80,14 +85,15 @@ void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdv case BLENDFACTOR_INV_SRC_ALPHA: { simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); - result.x = oneMinusSrcA; - result.y = oneMinusSrcA; - result.z = oneMinusSrcA; - result.w = oneMinusSrcA; + result.x = oneMinusSrcA; + result.y = oneMinusSrcA; + result.z = oneMinusSrcA; + result.w = oneMinusSrcA; break; } - case BLENDFACTOR_DST_ALPHA: result.x = dst.w; + case BLENDFACTOR_DST_ALPHA: + result.x = dst.w; result.y = dst.w; result.z = dst.w; result.w = dst.w; @@ -96,20 +102,20 @@ void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdv case BLENDFACTOR_INV_DST_ALPHA: { simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); - result.x = oneMinusDstA; - result.y = oneMinusDstA; - result.z = oneMinusDstA; - result.w = oneMinusDstA; + result.x = oneMinusDstA; + result.y = oneMinusDstA; + result.z = oneMinusDstA; + result.w = oneMinusDstA; break; } case BLENDFACTOR_SRC_ALPHA_SATURATE: { simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); - result.x = sat; - result.y = sat; - result.z = sat; - result.w = _simd_set1_ps(1.0); + result.x = sat; + result.y = sat; + result.z = sat; + result.w = _simd_set1_ps(1.0); break; } @@ -135,7 +141,8 @@ void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdv case BLENDFACTOR_INV_CONST_ALPHA: { - result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); + result.x = result.y = result.z = result.w = + _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); break; } @@ -161,7 +168,8 @@ void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdv result.x = result.y = result.z = 
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); break; - default: SWR_INVALID("Unimplemented blend factor: %d", func); + default: + SWR_INVALID("Unimplemented blend factor: %d", func); } if (Color) @@ -174,11 +182,15 @@ void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdv { out.w = result.w; } - } -template<bool Color, bool Alpha> -INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out) +template <bool Color, bool Alpha> +INLINE void BlendFunc(SWR_BLEND_OP blendOp, + simdvector& src, + simdvector& srcFactor, + simdvector& dst, + simdvector& dstFactor, + simdvector& out) { simdvector result; @@ -204,21 +216,21 @@ INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFact result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); break; - + case BLENDOP_MIN: result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); break; - + case BLENDOP_MAX: result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); break; - + default: SWR_INVALID("Unimplemented blend function: %d", blendOp); } @@ -235,8 +247,8 @@ INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFact } } -template<SWR_TYPE type> -INLINE void Clamp(simdvector &src) +template <SWR_TYPE type> +INLINE void Clamp(simdvector& src) { switch (type) { @@ -277,8 +289,13 @@ INLINE void Clamp(simdvector &src) } } -template<SWR_TYPE type> -void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, uint8_t *pDst, simdvector &result) +template <SWR_TYPE type> +void Blend(const SWR_BLEND_STATE* pBlendState, + const SWR_RENDER_TARGET_BLEND_STATE* pState, + simdvector& src, + simdvector& src1, + uint8_t* pDst, + simdvector& result) { // load render target simdvector dst; @@ -299,20 +316,33 @@ void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STA simdvector srcFactor, dstFactor; if (pBlendState->independentAlphaBlendEnable) { - GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); - GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor); - - GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); - GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); - - BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); - BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + GenerateBlendFactor<true, false>( + 
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, + constColor, + src, + src1, + dst, + srcFactor); + + GenerateBlendFactor<true, false>( + (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>( + (SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); + + BlendFunc<true, false>( + (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>( + (SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); } else { - GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); - GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + GenerateBlendFactor<true, true>( + (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<true, true>( + (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); - BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<true, true>( + (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index e6c22180683..8c53fca6432 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file clip.cpp -* -* @brief Implementation for clipping -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file clip.cpp + * + * @brief Implementation for clipping + * + ******************************************************************************/ #include <assert.h> @@ -42,115 +42,137 @@ float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); } -template<SWR_CLIPCODES ClippingPlane> +template <SWR_CLIPCODES ClippingPlane> inline void intersect( - int s, // index to first edge vertex v0 in pInPts. - int p, // index to second edge vertex v1 in pInPts. - const float *pInPts, // array of all the input positions. - const float *pInAttribs, // array of all attributes for all vertex. All the attributes for each vertex is contiguous. - int numInAttribs, // number of attributes per vertex. - int i, // output index. - float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. - float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. + int s, // index to first edge vertex v0 in pInPts. + int p, // index to second edge vertex v1 in pInPts. + const float* pInPts, // array of all the input positions. + const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each + // vertex is contiguous. + int numInAttribs, // number of attributes per vertex. + int i, // output index. + float* pOutPts, // array of output positions. We'll write our new intersection point at i*4. + float* pOutAttribs) // array of output attributes. We'll write our new attributes at + // i*numInAttribs. { float t; // Find the parameter of the intersection. // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. 
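// Worked example of that factor (an illustration, not part of the change):
    // clip the edge v1 = (x: 2, w: 1) -> v2 = (x: 0, w: 1) against RIGHT (x = w).
    //   d1 = v1.w - v1.x = -1   (v1 outside, d < 0)
    //   d2 = v2.w - v2.x =  1   (v2 inside,  d > 0)
    //   t  = d1 / (d1 - d2) = -1 / -2 = 0.5
    // Interpolating x gives 2 + (0 - 2) * 0.5 = 1 = w, i.e. the new vertex lands
    // exactly on the plane; this is what ComputeInterpFactor() above encodes.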
- const float *v1 = &pInPts[s*4]; - const float *v2 = &pInPts[p*4]; + const float* v1 = &pInPts[s * 4]; + const float* v2 = &pInPts[p * 4]; switch (ClippingPlane) { - case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; - case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; - case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; - case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; - case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; - case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; - default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); + case FRUSTUM_LEFT: + t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); + break; + case FRUSTUM_RIGHT: + t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); + break; + case FRUSTUM_TOP: + t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); + break; + case FRUSTUM_BOTTOM: + t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); + break; + case FRUSTUM_NEAR: + t = ComputeInterpFactor(v1[2], v2[2]); + break; + case FRUSTUM_FAR: + t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); + break; + default: + SWR_INVALID("invalid clipping plane: %d", ClippingPlane); }; + const float* a1 = &pInAttribs[s * numInAttribs]; + const float* a2 = &pInAttribs[p * numInAttribs]; - const float *a1 = &pInAttribs[s*numInAttribs]; - const float *a2 = &pInAttribs[p*numInAttribs]; - - float *pOutP = &pOutPts[i*4]; - float *pOutA = &pOutAttribs[i*numInAttribs]; + float* pOutP = &pOutPts[i * 4]; + float* pOutA = &pOutAttribs[i * numInAttribs]; // Interpolate new position. - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - pOutP[j] = v1[j] + (v2[j]-v1[j])*t; + pOutP[j] = v1[j] + (v2[j] - v1[j]) * t; } // Interpolate Attributes - for(int attr = 0; attr < numInAttribs; ++attr) + for (int attr = 0; attr < numInAttribs; ++attr) { - pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; + pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t; } } - // Checks whether vertex v lies inside clipping plane // in homogenous coords check -w < {x,y,z} < w; // -template<SWR_CLIPCODES ClippingPlane> +template <SWR_CLIPCODES ClippingPlane> inline int inside(const float v[4]) { switch (ClippingPlane) { - case FRUSTUM_LEFT : return (v[0]>=-v[3]); - case FRUSTUM_RIGHT : return (v[0]<= v[3]); - case FRUSTUM_TOP : return (v[1]>=-v[3]); - case FRUSTUM_BOTTOM : return (v[1]<= v[3]); - case FRUSTUM_NEAR : return (v[2]>=0.0f); - case FRUSTUM_FAR : return (v[2]<= v[3]); + case FRUSTUM_LEFT: + return (v[0] >= -v[3]); + case FRUSTUM_RIGHT: + return (v[0] <= v[3]); + case FRUSTUM_TOP: + return (v[1] >= -v[3]); + case FRUSTUM_BOTTOM: + return (v[1] <= v[3]); + case FRUSTUM_NEAR: + return (v[2] >= 0.0f); + case FRUSTUM_FAR: + return (v[2] <= v[3]); default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); return 0; } } - // Clips a polygon in homogenous coordinates to a particular clipping plane. 
// Takes in vertices of the polygon (InPts) and the clipping plane // Puts the vertices of the clipped polygon in OutPts // Returns number of points in clipped polygon // -template<SWR_CLIPCODES ClippingPlane> -int ClipTriToPlane( const float *pInPts, int numInPts, - const float *pInAttribs, int numInAttribs, - float *pOutPts, float *pOutAttribs) +template <SWR_CLIPCODES ClippingPlane> +int ClipTriToPlane(const float* pInPts, + int numInPts, + const float* pInAttribs, + int numInAttribs, + float* pOutPts, + float* pOutAttribs) { - int i=0; // index number of OutPts, # of vertices in OutPts = i div 4; + int i = 0; // index number of OutPts, # of vertices in OutPts = i div 4; for (int j = 0; j < numInPts; ++j) { int s = j; int p = (j + 1) % numInPts; - int s_in = inside<ClippingPlane>(&pInPts[s*4]); - int p_in = inside<ClippingPlane>(&pInPts[p*4]); + int s_in = inside<ClippingPlane>(&pInPts[s * 4]); + int p_in = inside<ClippingPlane>(&pInPts[p * 4]); // test if vertex is to be added to output vertices - if (s_in != p_in) // edge crosses clipping plane + if (s_in != p_in) // edge crosses clipping plane { // find point of intersection - intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); + intersect<ClippingPlane>( + s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); i++; } if (p_in) // 2nd vertex is inside clipping volume, add it to output { // Copy 2nd vertex position of edge over to output. - for(int k = 0; k < 4; ++k) + for (int k = 0; k < 4; ++k) { - pOutPts[i*4 + k] = pInPts[p*4 + k]; + pOutPts[i * 4 + k] = pInPts[p * 4 + k]; } // Copy 2nd vertex attributes of edge over to output. - for(int attr = 0; attr < numInAttribs; ++attr) + for (int attr = 0; attr < numInAttribs; ++attr) { - pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr]; + pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr]; } i++; } @@ -160,8 +182,14 @@ int ClipTriToPlane( const float *pInPts, int numInPts, return i; } -void ClipRectangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, - simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx) +void ClipRectangles(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { RDTSC_BEGIN(FEClipRectangles, pDC->drawId); Clipper<SIMD256, 3> clipper(workerId, pDC); @@ -169,8 +197,14 @@ void ClipRectangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvect RDTSC_END(FEClipRectangles, 1); } -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, - simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx) +void ClipTriangles(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { RDTSC_BEGIN(FEClipTriangles, pDC->drawId); Clipper<SIMD256, 3> clipper(workerId, pDC); @@ -178,8 +212,14 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto RDTSC_END(FEClipTriangles, 1); } -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, - simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx) +void ClipLines(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + 
simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { RDTSC_BEGIN(FEClipLines, pDC->drawId); Clipper<SIMD256, 2> clipper(workerId, pDC); @@ -187,8 +227,14 @@ void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector pr RDTSC_END(FEClipLines, 1); } -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, - simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx) +void ClipPoints(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx) { RDTSC_BEGIN(FEClipPoints, pDC->drawId); Clipper<SIMD256, 1> clipper(workerId, pDC); @@ -197,12 +243,21 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p } #if USE_SIMD16_FRONTEND -void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, - simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx) +void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { RDTSC_BEGIN(FEClipRectangles, pDC->drawId); - enum { VERTS_PER_PRIM = 3 }; + enum + { + VERTS_PER_PRIM = 3 + }; Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); @@ -212,12 +267,21 @@ void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wo RDTSC_END(FEClipRectangles, 1); } -void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, - simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx) +void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { RDTSC_BEGIN(FEClipTriangles, pDC->drawId); - enum { VERTS_PER_PRIM = 3 }; + enum + { + VERTS_PER_PRIM = 3 + }; Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); @@ -227,12 +291,21 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wor RDTSC_END(FEClipTriangles, 1); } -void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, - simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx) +void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { RDTSC_BEGIN(FEClipLines, pDC->drawId); - enum { VERTS_PER_PRIM = 2 }; + enum + { + VERTS_PER_PRIM = 2 + }; Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); @@ -242,12 +315,21 @@ void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI RDTSC_END(FEClipLines, 1); } -void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, - simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx) +void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + 
simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx) { RDTSC_BEGIN(FEClipPoints, pDC->drawId); - enum { VERTS_PER_PRIM = 1 }; + enum + { + VERTS_PER_PRIM = 1 + }; Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 90ae4263575..7b4ed58c3fa 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file clip.h -* -* @brief Definitions for clipping -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file clip.h + * + * @brief Definitions for clipping + * + ******************************************************************************/ #pragma once #include "common/simdintrin.h" @@ -40,18 +40,19 @@ extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7]; enum SWR_CLIPCODES { - // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. - // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. +// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. +// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, +// rather than intersection, of clipcodes. #define CLIPCODE_SHIFT 23 - FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), - FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), - FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), - FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), + FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), + FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), + FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), + FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), - FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), - FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), + FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), + FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), - NEGW = (0x40 << CLIPCODE_SHIFT), + NEGW = (0x40 << CLIPCODE_SHIFT), GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), @@ -59,32 +60,41 @@ enum SWR_CLIPCODES GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) }; -#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) -#define FRUSTUM_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|FRUSTUM_LEFT|FRUSTUM_RIGHT|FRUSTUM_TOP|FRUSTUM_BOTTOM) - -template<typename SIMD_T> -void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes) +#define GUARDBAND_CLIP_MASK \ + (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \ + GUARDBAND_BOTTOM | NEGW) +#define FRUSTUM_CLIP_MASK \ + (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM) + +template <typename SIMD_T> +void ComputeClipCodes(const API_STATE& state, + const Vec4<SIMD_T>& vertex, + Float<SIMD_T>& clipCodes, + Integer<SIMD_T> const& viewportIndexes) { clipCodes = SIMD_T::setzero_ps(); // -w - Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f)); + Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f)); // FRUSTUM_LEFT Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); - clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); + clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); // FRUSTUM_TOP - vRes = SIMD_T::cmplt_ps(vertex.y, vNegW); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP)))); + vRes = SIMD_T::cmplt_ps(vertex.y, vNegW); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP)))); // FRUSTUM_RIGHT - vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT)))); + vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT)))); // 
FRUSTUM_BOTTOM - vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM)))); + vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM)))); if (state.rastState.depthClipEnable) { @@ -98,50 +108,66 @@ void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float< { vRes = SIMD_T::cmplt_ps(vertex.z, vNegW); } - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR)))); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR)))); // FRUSTUM_FAR - vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR)))); + vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR)))); } // NEGW vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps()); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW)))); + clipCodes = + SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW)))); // GUARDBAND_LEFT - Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes)); - vRes = SIMD_T::cmplt_ps(vertex.x, gbMult); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT)))); + Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, + SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( + &state.gbState.left[0], viewportIndexes)); + vRes = SIMD_T::cmplt_ps(vertex.x, gbMult); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT)))); // GUARDBAND_TOP - gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes)); - vRes = SIMD_T::cmplt_ps(vertex.y, gbMult); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP)))); + gbMult = SIMD_T::mul_ps(vNegW, + SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( + &state.gbState.top[0], viewportIndexes)); + vRes = SIMD_T::cmplt_ps(vertex.y, gbMult); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP)))); // GUARDBAND_RIGHT - gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes)); - vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT)))); + gbMult = SIMD_T::mul_ps(vertex.w, + SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( + &state.gbState.right[0], viewportIndexes)); + vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT)))); // GUARDBAND_BOTTOM - gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes)); - vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult); - clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, 
SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM)))); + gbMult = SIMD_T::mul_ps(vertex.w, + SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( + &state.gbState.bottom[0], viewportIndexes)); + vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult); + clipCodes = SIMD_T::or_ps( + clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM)))); } -template<typename SIMD_T> +template <typename SIMD_T> struct BinnerChooser { }; -template<> +template <> struct BinnerChooser<SIMD256> { PFN_PROCESS_PRIMS pfnBinFunc; BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) - :pfnBinFunc(nullptr) + : + pfnBinFunc(nullptr) { if (numVertsPerPrim == 3) { @@ -159,7 +185,8 @@ struct BinnerChooser<SIMD256> } BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) - :pfnBinFunc(nullptr) + : + pfnBinFunc(nullptr) { switch (topology) { @@ -179,7 +206,14 @@ struct BinnerChooser<SIMD256> }; } - void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx) + void BinFunc(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + SIMD256::Vec4 prims[], + uint32_t primMask, + SIMD256::Integer const& primID, + SIMD256::Integer& viewportIdx, + SIMD256::Integer& rtIdx) { SWR_ASSERT(pfnBinFunc != nullptr); @@ -188,13 +222,14 @@ struct BinnerChooser<SIMD256> }; #if USE_SIMD16_FRONTEND -template<> +template <> struct BinnerChooser<SIMD512> { PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc; BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) - :pfnBinFunc(nullptr) + : + pfnBinFunc(nullptr) { if (numVertsPerPrim == 3) { @@ -212,7 +247,8 @@ struct BinnerChooser<SIMD512> } BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) - :pfnBinFunc(nullptr) + : + pfnBinFunc(nullptr) { switch (topology) { @@ -232,7 +268,14 @@ struct BinnerChooser<SIMD512> }; } - void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx) + void BinFunc(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + SIMD512::Vec4 prims[], + uint32_t primMask, + SIMD512::Integer const& primID, + SIMD512::Integer& viewportIdx, + SIMD512::Integer& rtIdx) { SWR_ASSERT(pfnBinFunc != nullptr); @@ -241,18 +284,15 @@ struct BinnerChooser<SIMD512> }; #endif -template<typename SIMD_T> +template <typename SIMD_T> struct SimdHelper { }; -template<> +template <> struct SimdHelper<SIMD256> { - static SIMD256::Float insert_lo_ps(SIMD256::Float a) - { - return a; - } + static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; } static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b) { @@ -261,7 +301,7 @@ struct SimdHelper<SIMD256> }; #if USE_SIMD16_FRONTEND -template<> +template <> struct SimdHelper<SIMD512> { static SIMD512::Float insert_lo_ps(SIMD256::Float a) @@ -277,32 +317,26 @@ struct SimdHelper<SIMD512> #endif // Temp storage used by the clipper -template<typename SIMD_T> +template <typename SIMD_T> struct ClipHelper { }; -template<> +template <> struct ClipHelper<SIMD256> { - static SIMDVERTEX_T<SIMD256> *GetTempVertices() - { - return tlsTempVertices; - } + static SIMDVERTEX_T<SIMD256>* GetTempVertices() { return tlsTempVertices; } }; #if USE_SIMD16_FRONTEND -template<> +template <> struct ClipHelper<SIMD512> { - static SIMDVERTEX_T<SIMD512> *GetTempVertices() - { - return tlsTempVertices_simd16; - } + static 
SIMDVERTEX_T<SIMD512>* GetTempVertices() { return tlsTempVertices_simd16; } }; #endif -template<typename SIMD_T, uint32_t NumVertsPerPrim> +template <typename SIMD_T, uint32_t NumVertsPerPrim> class Clipper { public: @@ -312,7 +346,7 @@ public: static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); } - void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes) + void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes) { for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { @@ -348,7 +382,8 @@ public: { Float<SIMD_T> clipUnion = ComputeClipCodeUnion(); - clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK))); + clipUnion = + SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK))); return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps())); } @@ -360,19 +395,21 @@ public: for (uint32_t e = 0; e < NumVertsPerPrim; ++e) { - Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]); + Float<SIMD_T> vNan01 = + SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]); vNanMask = SIMD_T::or_ps(vNanMask, vNan01); - Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]); + Float<SIMD_T> vNan23 = + SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]); vNanMask = SIMD_T::or_ps(vNanMask, vNan23); } return SIMD_T::movemask_ps(vNanMask); } - int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[]) + int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[]) { - uint8_t cullMask = state.backendState.cullDistanceMask; + uint8_t cullMask = state.backendState.cullDistanceMask; uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset; Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps(); @@ -387,7 +424,7 @@ public: while (_BitScanForward(&index, cullMask)) { cullMask &= ~(1 << index); - uint32_t slot = index >> 2; + uint32_t slot = index >> 2; uint32_t component = index & 0x3; Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f); @@ -404,7 +441,8 @@ public: } // cull if cull distance < 0 || NAN - Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp); + Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>( + SIMD_T::setzero_ps(), vCullComp); vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); } vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem); @@ -415,7 +453,7 @@ public: while (_BitScanForward(&index, clipMask)) { clipMask &= ~(1 << index); - uint32_t slot = index >> 2; + uint32_t slot = index >> 2; uint32_t component = index & 0x3; Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f); @@ -431,8 +469,10 @@ public: vClipComp = vClipCullDistHi[e][component]; } - Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp); - Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp); + Float<SIMD_T> vClip = + SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp); + Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>( + SIMD_T::setzero_ps(), vClipComp); vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip); } @@ -442,14 +482,19 @@ public: return SIMD_T::movemask_ps(vClipCullMask); } - void ClipSimd(const 
Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa, - const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx) + void ClipSimd(const Vec4<SIMD_T> prim[], + const Float<SIMD_T>& vPrimMask, + const Float<SIMD_T>& vClipMask, + PA_STATE& pa, + const Integer<SIMD_T>& vPrimId, + const Integer<SIMD_T>& vViewportIdx, + const Integer<SIMD_T>& vRtIdx) { // input/output vertex store for clipper SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle uint32_t constantInterpMask = state.backendState.constantInterpolationMask; - uint32_t provokingVertex = 0; + uint32_t provokingVertex = 0; if (pa.binTopology == TOP_TRIANGLE_FAN) { provokingVertex = state.frontendState.provokingVertex.triFan; @@ -470,8 +515,9 @@ public: for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot) { // Compute absolute attrib slot in vertex array - uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot; - maxSlot = std::max<int32_t>(maxSlot, mapSlot); + uint32_t mapSlot = + backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot; + maxSlot = std::max<int32_t>(maxSlot, mapSlot); uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot; pa.Assemble(inputSlot, tmpVector); @@ -516,9 +562,11 @@ public: uint32_t numAttribs = maxSlot + 1; - Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); + Integer<SIMD_T> vNumClippedVerts = + ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); - BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast); + BinnerChooser<SIMD_T> binner(NumVertsPerPrim, + pa.pDC->pState->state.rastState.conservativeRast); // set up new PA for binning clipped primitives PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; @@ -545,20 +593,20 @@ public: SWR_ASSERT(0 && "Unexpected points in clipper."); } - const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts); - const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId); - const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx); - const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx); - - const SIMD256::Integer vOffsets = SIMD256::set_epi32( - 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane - 6 * sizeof(SIMDVERTEX_T<SIMD_T>), - 5 * sizeof(SIMDVERTEX_T<SIMD_T>), - 4 * sizeof(SIMDVERTEX_T<SIMD_T>), - 3 * sizeof(SIMDVERTEX_T<SIMD_T>), - 2 * sizeof(SIMDVERTEX_T<SIMD_T>), - 1 * sizeof(SIMDVERTEX_T<SIMD_T>), - 0 * sizeof(SIMDVERTEX_T<SIMD_T>)); + const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts); + const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId); + const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx); + const uint32_t* pRtIdx = reinterpret_cast<const uint32_t*>(&vRtIdx); + + const SIMD256::Integer vOffsets = + SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane + 6 * sizeof(SIMDVERTEX_T<SIMD_T>), + 5 * sizeof(SIMDVERTEX_T<SIMD_T>), + 4 * sizeof(SIMDVERTEX_T<SIMD_T>), + 3 * sizeof(SIMDVERTEX_T<SIMD_T>), + 2 * sizeof(SIMDVERTEX_T<SIMD_T>), + 1 * sizeof(SIMDVERTEX_T<SIMD_T>), + 0 * sizeof(SIMDVERTEX_T<SIMD_T>)); // only need to gather 7 verts // @todo dynamic mask based on actual # of verts generated per lane @@ -571,14 +619,16 @@ public: // for triangle fan #if defined(_DEBUG) - // TODO: need to 
increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds - SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64)); + // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack + // overflow in debug builds + SIMDVERTEX_T<SIMD_T>* transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T>*>( + AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64)); #else - SIMDVERTEX_T<SIMD_T> transposedPrims[2]; + SIMDVERTEX_T<SIMD_T> transposedPrims[2]; #endif - uint32_t numInputPrims = pa.NumPrims(); + uint32_t numInputPrims = pa.NumPrims(); for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) { uint32_t numEmittedVerts = pVertexCount[inputPrim]; @@ -598,7 +648,8 @@ public: // for triangle fan // transpose pos - uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + uint8_t* pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + + sizeof(float) * inputPrim; #if 0 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug @@ -607,13 +658,17 @@ public: #endif for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); - transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); + SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase), vOffsets, vMask); + transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = + SimdHelper<SIMD_T>::insert_lo_ps(temp); pBase += sizeof(Float<SIMD_T>); } // transpose attribs - pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim; + pBase = + reinterpret_cast<uint8_t*>(&vertices[0].attrib[backendState.vertexAttribOffset]) + + sizeof(float) * inputPrim; for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) { @@ -621,8 +676,14 @@ public: for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); - transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); + SIMD256::Float temp = + SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD256::setzero_ps(), + reinterpret_cast<const float*>(pBase), + vOffsets, + vMask); + transposedPrims[0].attrib[attribSlot][c] = + SimdHelper<SIMD_T>::insert_lo_ps(temp); pBase += sizeof(Float<SIMD_T>); } } @@ -631,40 +692,60 @@ public: uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset; if (state.backendState.clipDistanceMask & 0x0f) { - pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim; + pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot]) + + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); - transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); + SIMD256::Float temp = + SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD256::setzero_ps(), + reinterpret_cast<const float*>(pBase), + 
vOffsets, + vMask); + transposedPrims[0].attrib[vertexClipCullSlot][c] = + SimdHelper<SIMD_T>::insert_lo_ps(temp); pBase += sizeof(Float<SIMD_T>); } } if (state.backendState.clipDistanceMask & 0xf0) { - pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim; + pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot + 1]) + + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); - transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); + SIMD256::Float temp = + SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD256::setzero_ps(), + reinterpret_cast<const float*>(pBase), + vOffsets, + vMask); + transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = + SimdHelper<SIMD_T>::insert_lo_ps(temp); pBase += sizeof(Float<SIMD_T>); } } - PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology); + PA_STATE_OPT clipPA(pDC, + numEmittedPrims, + reinterpret_cast<uint8_t*>(&transposedPrims[0]), + numEmittedVerts, + SWR_VTX_NUM_SLOTS, + true, + NumVertsPerPrim, + clipTopology); clipPA.viewportArrayActive = pa.viewportArrayActive; - clipPA.rtArrayActive = pa.rtArrayActive; + clipPA.rtArrayActive = pa.rtArrayActive; - static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f }; + static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f}; const uint32_t primMask = primMaskMap[numEmittedPrims]; - const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); + const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]); - const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); - + const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); while (clipPA.GetNextStreamOutput()) { @@ -676,7 +757,8 @@ public: if (assemble) { - binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx); + binner.pfnBinFunc( + pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx); } } while (clipPA.NextPrim()); @@ -691,12 +773,17 @@ public: UPDATE_STAT_FE(CPrimitives, numClippedPrims); } - void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask, - Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx) + void ExecuteStage(PA_STATE& pa, + Vec4<SIMD_T> prim[], + uint32_t primMask, + Integer<SIMD_T> const& primId, + Integer<SIMD_T> const& viewportIdx, + Integer<SIMD_T> const& rtIdx) { SWR_ASSERT(pa.pDC != nullptr); - BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast); + BinnerChooser<SIMD_T> binner(pa.binTopology, + pa.pDC->pState->state.rastState.conservativeRast); // update clipper invocations pipeline stat uint32_t numInvoc = _mm_popcnt_u32(primMask); @@ -707,7 +794,7 @@ public: // cull prims with NAN coords primMask &= ~ComputeNaNMask(prim); - // user cull distance cull + // user cull distance cull if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask) { primMask &= ~ComputeUserClipCullMask(pa, prim); @@ -715,10 +802,12 @@ public: Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection(); 
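// In classic outcode terms: the AND (intersection) of all vertex clip codes
    // is nonzero only when every vertex lies outside the same frustum plane, so
    // the primitive can be culled below without any clipping work; the OR
    // (ComputeClipCodeUnion above) only flags primitives that may need clipping.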
// Mask out non-frustum codes - clipIntersection = SIMD_T::and_ps(clipIntersection, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK))); + clipIntersection = SIMD_T::and_ps(clipIntersection, + SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK))); // cull prims outside view frustum - int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps()); + int validMask = + primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps()); // skip clipping for points uint32_t clipMask = 0; @@ -734,7 +823,13 @@ public: RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(prim, SIMD_T::vmask_ps(validMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx); + ClipSimd(prim, + SIMD_T::vmask_ps(validMask), + SIMD_T::vmask_ps(clipMask), + pa, + primId, + viewportIdx, + rtIdx); RDTSC_END(FEGuardbandClip, 1); } else if (validMask) @@ -743,24 +838,26 @@ public: UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); // forward valid prims directly to binner - binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx); + binner.pfnBinFunc( + this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx); } } private: - Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1) + Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0, + Float<SIMD_T> const& boundaryCoord1) { return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1)); } - Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component) + Integer<SIMD_T> + ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component) { const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>); const uint32_t componentStride = sizeof(Float<SIMD_T>); const uint32_t attribStride = sizeof(Vec4<SIMD_T>); - static const OSALIGNSIMD16(uint32_t) elemOffset[16] = - { + static const OSALIGNSIMD16(uint32_t) elemOffset[16] = { 0 * sizeof(float), 1 * sizeof(float), 2 * sizeof(float), @@ -779,15 +876,19 @@ private: 15 * sizeof(float), }; - static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets."); + static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), + "Clipper::ComputeOffsets, Increase number of element offsets."); - Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset)); + Integer<SIMD_T> vElemOffset = + SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset)); // step to the simdvertex - Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); + Integer<SIMD_T> vOffsets = + SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); // step to the attribute and component - vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component)); + vOffsets = SIMD_T::add_epi32( + vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component)); // step to the lane vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset); @@ -795,53 +896,71 @@ private: return vOffsets; } - Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component) + Float<SIMD_T> GatherComponent(const float* pBuffer, + 
uint32_t attrib, + Float<SIMD_T> const& vMask, + Integer<SIMD_T> const& vIndices, + uint32_t component) { Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component); - Float<SIMD_T> vSrc = SIMD_T::setzero_ps(); + Float<SIMD_T> vSrc = SIMD_T::setzero_ps(); - return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask); + return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + vSrc, pBuffer, vOffsets, vMask); } - void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc) + void ScatterComponent(const float* pBuffer, + uint32_t attrib, + Float<SIMD_T> const& vMask, + Integer<SIMD_T> const& vIndices, + uint32_t component, + Float<SIMD_T> const& vSrc) { Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component); - const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets); - const float *pSrc = reinterpret_cast<const float *>(&vSrc); - uint32_t mask = SIMD_T::movemask_ps(vMask); - DWORD lane; + const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets); + const float* pSrc = reinterpret_cast<const float*>(&vSrc); + uint32_t mask = SIMD_T::movemask_ps(vMask); + DWORD lane; while (_BitScanForward(&lane, mask)) { mask &= ~(1 << lane); - const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane]; - *(float *)pBuf = pSrc[lane]; + const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane]; + *(float*)pBuf = pSrc[lane]; } } - template<SWR_CLIPCODES ClippingPlane> - void intersect( - const Float<SIMD_T> &vActiveMask, // active lanes to operate on - const Integer<SIMD_T> &s, // index to first edge vertex v0 in pInPts. - const Integer<SIMD_T> &p, // index to second edge vertex v1 in pInPts. - const Vec4<SIMD_T> &v1, // vertex 0 position - const Vec4<SIMD_T> &v2, // vertex 1 position - Integer<SIMD_T> &outIndex, // output index. - const float *pInVerts, // array of all the input positions. - uint32_t numInAttribs, // number of attributes per vertex. - float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. + template <SWR_CLIPCODES ClippingPlane> + void intersect(const Float<SIMD_T>& vActiveMask, // active lanes to operate on + const Integer<SIMD_T>& s, // index to first edge vertex v0 in pInPts. + const Integer<SIMD_T>& p, // index to second edge vertex v1 in pInPts. + const Vec4<SIMD_T>& v1, // vertex 0 position + const Vec4<SIMD_T>& v2, // vertex 1 position + Integer<SIMD_T>& outIndex, // output index. + const float* pInVerts, // array of all the input positions. + uint32_t numInAttribs, // number of attributes per vertex. + float* pOutVerts) // array of output positions. We'll write our new intersection + // point at i*4. 
{ - uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; + uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset; // compute interpolation factor Float<SIMD_T> t; switch (ClippingPlane) { - case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break; - case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break; - case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break; - case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break; + case FRUSTUM_LEFT: + t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); + break; + case FRUSTUM_RIGHT: + t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); + break; + case FRUSTUM_TOP: + t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); + break; + case FRUSTUM_BOTTOM: + t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); + break; case FRUSTUM_NEAR: // DX Znear plane is 0, GL is -w if (this->state.rastState.clipHalfZ) @@ -853,8 +972,11 @@ private: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2])); } break; - case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break; - default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); + case FRUSTUM_FAR: + t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); + break; + default: + SWR_INVALID("invalid clipping plane: %d", ClippingPlane); }; // interpolate position and store @@ -872,7 +994,8 @@ private: { Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float<SIMD_T> vOutAttrib = + SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } @@ -885,7 +1008,8 @@ private: { Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float<SIMD_T> vOutAttrib = + SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } @@ -897,44 +1021,58 @@ private: { Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float<SIMD_T> vOutAttrib = + SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } } - template<SWR_CLIPCODES ClippingPlane> - Float<SIMD_T> inside(const Vec4<SIMD_T> &v) + template <SWR_CLIPCODES ClippingPlane> + Float<SIMD_T> inside(const Vec4<SIMD_T>& v) { switch (ClippingPlane) { - case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], 
SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]); - case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]); - case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]); + case FRUSTUM_LEFT: + return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); + case FRUSTUM_RIGHT: + return SIMD_T::cmple_ps(v[0], v[3]); + case FRUSTUM_TOP: + return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); + case FRUSTUM_BOTTOM: + return SIMD_T::cmple_ps(v[1], v[3]); + case FRUSTUM_NEAR: + return SIMD_T::cmpge_ps(v[2], + this->state.rastState.clipHalfZ + ? SIMD_T::setzero_ps() + : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); + case FRUSTUM_FAR: + return SIMD_T::cmple_ps(v[2], v[3]); default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); return SIMD_T::setzero_ps(); } } - template<SWR_CLIPCODES ClippingPlane> - Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts) + template <SWR_CLIPCODES ClippingPlane> + Integer<SIMD_T> ClipTriToPlane(const float* pInVerts, + const Integer<SIMD_T>& vNumInPts, + uint32_t numInAttribs, + float* pOutVerts) { uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); - Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); - Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); + Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); + Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); + Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty { - Integer<SIMD_T> s = vCurIndex; - Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); + Integer<SIMD_T> s = vCurIndex; + Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p); - p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask))); + p = SIMD_T::castps_si(SIMD_T::blendv_ps( + SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask))); // gather position Vec4<SIMD_T> vInPos0, vInPos1; @@ -950,7 +1088,7 @@ private: // compute intersection mask (s_in != p_in) Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in); - intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); + intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); // store s if inside s_in = SIMD_T::and_ps(s_in, vActiveMask); @@ -959,7 +1097,8 @@ private: // store position for (uint32_t c = 0; c < 4; ++c) { - ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + ScatterComponent( + pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); } // store attribs @@ -996,34 +1135,47 @@ private: } // increment outIndex - vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); + vOutIndex = SIMD_T::blendv_epi32( + vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); } // compute and store intersection if (!SIMD_T::testz_ps(intersectMask, intersectMask)) { - 
intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + intersect<ClippingPlane>(intersectMask, + s, + p, + vInPos0, + vInPos1, + vOutIndex, + pInVerts, + numInAttribs, + pOutVerts); // increment outIndex for active lanes - vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); + vOutIndex = SIMD_T::blendv_epi32( + vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); } // increment loop index and update active mask - vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1)); + vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1)); vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); } return vOutIndex; } - template<SWR_CLIPCODES ClippingPlane> - Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts) + template <SWR_CLIPCODES ClippingPlane> + Integer<SIMD_T> ClipLineToPlane(const float* pInVerts, + const Integer<SIMD_T>& vNumInPts, + uint32_t numInAttribs, + float* pOutVerts) { uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); - Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); - Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); + Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); + Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); + Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); if (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) { @@ -1044,7 +1196,7 @@ private: // compute intersection mask (s_in != p_in) Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in); - intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); + intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); // store s if inside s_in = SIMD_T::and_ps(s_in, vActiveMask); @@ -1052,7 +1204,8 @@ private: { for (uint32_t c = 0; c < 4; ++c) { - ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + ScatterComponent( + pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); } // interpolate attributes and store @@ -1067,16 +1220,26 @@ private: } // increment outIndex - vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); + vOutIndex = SIMD_T::blendv_epi32( + vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); } // compute and store intersection if (!SIMD_T::testz_ps(intersectMask, intersectMask)) { - intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + intersect<ClippingPlane>(intersectMask, + s, + p, + vInPos0, + vInPos1, + vOutIndex, + pInVerts, + numInAttribs, + pOutVerts); // increment outIndex for active lanes - vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); + vOutIndex = SIMD_T::blendv_epi32( + vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); } // store p if inside @@ -1085,7 +1248,8 @@ private: { for (uint32_t c = 0; c < 4; ++c) { - ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); + ScatterComponent( + pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); } // interpolate attributes and store @@ -1100,17 +1264,21 @@ private: } // increment outIndex - vOutIndex = SIMD_T::blendv_epi32(vOutIndex, 
SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in); + vOutIndex = SIMD_T::blendv_epi32( + vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in); } } return vOutIndex; } - Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs) + Integer<SIMD_T> ClipPrims(float* pVertices, + const Float<SIMD_T>& vPrimMask, + const Float<SIMD_T>& vClipMask, + int numAttribs) { // temp storage - float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices()); + float* pTempVerts = reinterpret_cast<float*>(ClipHelper<SIMD_T>::GetTempVertices()); // zero out num input verts for non-active lanes Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim); @@ -1122,45 +1290,109 @@ private: { vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = + ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = + ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = + ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); } else { SWR_ASSERT(NumVertsPerPrim == 2); - vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); - vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = + ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = + ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = + ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = + ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = + ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = + ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); } // restore num verts for non-clipped, active lanes Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); - vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask); + vNumOutPts = + SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask); return vNumOutPts; } - const uint32_t workerId{ 0 }; - DRAW_CONTEXT *pDC{ nullptr }; - const API_STATE &state; - Float<SIMD_T> clipCodes[NumVertsPerPrim]; + const uint32_t workerId{0}; + DRAW_CONTEXT* pDC{nullptr}; + const API_STATE& state; + Float<SIMD_T> clipCodes[NumVertsPerPrim]; }; - // pipeline stage functions -void ClipRectangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t 
primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); +void ClipRectangles(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); +void ClipTriangles(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); +void ClipLines(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); +void ClipPoints(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primId, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); #if USE_SIMD16_FRONTEND -void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); -void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); -void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); -void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); +void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); +void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); +void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); +void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primId, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); #endif - diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h index 00c3a87c188..9e7f96cdeac 100644 --- 
a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h +++ b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h @@ -1,28 +1,28 @@ /**************************************************************************** -* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file conservativerast.h -* -******************************************************************************/ + * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file conservativerast.h + * + ******************************************************************************/ #pragma once #include <type_traits> #include "common/simdintrin.h" @@ -38,77 +38,82 @@ enum FixedPointFmt ////////////////////////////////////////////////////////////////////////// /// @brief convenience typedefs for supported Fixed Point precisions typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit; -typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8; -typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9; -typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16; +typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8; +typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9; +typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16; ////////////////////////////////////////////////////////////////////////// /// @struct FixedPointTraits -/// @brief holds constants relating to converting between FP and Fixed point +/// @brief holds constants relating to converting between FP and Fixed point /// @tparam FT: fixed precision type -template<typename FT> -struct FixedPointTraits{}; +template <typename FT> +struct FixedPointTraits +{ +}; ////////////////////////////////////////////////////////////////////////// /// @brief Fixed_16_8 specialization of FixedPointTraits -template<> +template <> struct FixedPointTraits<Fixed_16_8> { /// multiplier to go from FP32 to Fixed Point 16.8 typedef std::integral_constant<uint32_t, 256> ScaleT; /// number of bits to shift to go from 16.8 fixed => int32 typedef std::integral_constant<uint32_t, 8> BitsT; - typedef Fixed_16_8 TypeT; + typedef Fixed_16_8 TypeT; }; ////////////////////////////////////////////////////////////////////////// /// @brief Fixed_16_9 specialization of FixedPointTraits -template<> +template <> struct FixedPointTraits<Fixed_16_9> { /// multiplier to go from FP32 to Fixed Point 16.9 typedef std::integral_constant<uint32_t, 512> ScaleT; /// number of bits to shift to go from 16.9 fixed => int32 typedef std::integral_constant<uint32_t, 9> BitsT; - typedef Fixed_16_9 TypeT; + typedef Fixed_16_9 TypeT; }; ////////////////////////////////////////////////////////////////////////// /// @brief Fixed_16_9 specialization of FixedPointTraits -template<> +template <> struct FixedPointTraits<Fixed_X_16> { /// multiplier to go from FP32 to Fixed Point X.16 typedef std::integral_constant<uint32_t, 65536> ScaleT; /// number of bits to shift to go from X.16 fixed => int32 typedef std::integral_constant<uint32_t, 16> BitsT; - typedef Fixed_X_16 TypeT; + typedef Fixed_X_16 TypeT; }; ////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for conservative rasterization modes +/// @brief convenience typedefs for conservative rasterization modes typedef std::false_type StandardRastT; -typedef std::true_type ConservativeRastT; +typedef std::true_type ConservativeRastT; ////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for Input Coverage rasterization modes -typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT; +/// @brief convenience typedefs for Input Coverage rasterization modes +typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT; typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT; -typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> InnerConservativeCoverageT; 
+typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> + InnerConservativeCoverageT; ////////////////////////////////////////////////////////////////////////// /// @struct ConservativeRastTraits /// @brief primary ConservativeRastTraits template. Shouldn't be instantiated /// @tparam ConservativeT: type of conservative rasterization template <typename ConservativeT> -struct ConservativeRastFETraits {}; +struct ConservativeRastFETraits +{ +}; ////////////////////////////////////////////////////////////////////////// /// @brief StandardRast specialization of ConservativeRastTraits template <> struct ConservativeRastFETraits<StandardRastT> { - typedef std::false_type IsConservativeT; + typedef std::false_type IsConservativeT; typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT; }; @@ -117,13 +122,13 @@ struct ConservativeRastFETraits<StandardRastT> template <> struct ConservativeRastFETraits<ConservativeRastT> { - typedef std::true_type IsConservativeT; + typedef std::true_type IsConservativeT; typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT; }; ////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for ConservativeRastFETraits -typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT; +/// @brief convenience typedefs for ConservativeRastFETraits +typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT; typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT; ////////////////////////////////////////////////////////////////////////// @@ -133,10 +138,11 @@ typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT; /// @tparam ConservativeT: type of conservative rasterization /// @tparam InputCoverageT: type of input coverage requested, if any template <typename ConservativeT, typename _InputCoverageT> -struct ConservativeRastBETraits { - typedef std::false_type IsConservativeT; - typedef _InputCoverageT InputCoverageT; - typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; +struct ConservativeRastBETraits +{ + typedef std::false_type IsConservativeT; + typedef _InputCoverageT InputCoverageT; + typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; }; @@ -146,9 +152,9 @@ struct ConservativeRastBETraits { template <typename _InputCoverageT> struct ConservativeRastBETraits<StandardRastT, _InputCoverageT> { - typedef std::false_type IsConservativeT; - typedef _InputCoverageT InputCoverageT; - typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; + typedef std::false_type IsConservativeT; + typedef _InputCoverageT InputCoverageT; + typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; }; @@ -159,16 +165,17 @@ struct ConservativeRastBETraits<StandardRastT, _InputCoverageT> template <> struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT> { - typedef std::true_type IsConservativeT; + typedef std::true_type IsConservativeT; typedef NoInputCoverageT InputCoverageT; typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of - /// of having to compare 
individual edges to pixel corners to check if any part of the triangle - /// intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT; + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead + /// of having to compare individual edges to pixel corners to check if any part of the + /// triangle intersects a pixel + typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1> + ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; }; @@ -178,18 +185,18 @@ struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT> template <> struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT> { - typedef std::true_type IsConservativeT; + typedef std::true_type IsConservativeT; typedef OuterConservativeCoverageT InputCoverageT; typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of - /// of having to compare individual edges to pixel corners to check if any part of the triangle - /// intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT; + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead + /// of having to compare individual edges to pixel corners to check if any part of the + /// triangle intersects a pixel + typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1> + ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; - }; ////////////////////////////////////////////////////////////////////////// @@ -198,19 +205,25 @@ struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT> template <> struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT> { - typedef std::true_type IsConservativeT; + typedef std::true_type IsConservativeT; typedef InnerConservativeCoverageT InputCoverageT; typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of - /// of having to compare individual edges to pixel corners to check if any part of the triangle - /// intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT; - - /// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of - /// of having to compare individual edges to pixel corners to check if a pixel is fully covered by a triangle - typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1) - ConservativeEdgeOffsetT::value)> InnerConservativeEdgeOffsetT; + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead + /// of having to compare individual edges to pixel corners to check if any part of the + /// triangle intersects a pixel + typedef std::integral_constant<int32_t,
(ConservativePrecisionT::ScaleT::value / 2) + 1> + ConservativeEdgeOffsetT; + + /// undo the outer conservative offset and offset edge toward the pixel center by 1/2 pixel + + /// 1/512, in Fixed 16.9 precision this allows the rasterizer to do the 3 edge coverage tests + /// against a single point, instead of having to compare individual edges to pixel corners to + /// check if a pixel is fully covered by a triangle + typedef std::integral_constant<int32_t, + static_cast<int32_t>( + -((ConservativePrecisionT::ScaleT::value / 2) + 1) - + ConservativeEdgeOffsetT::value)> + InnerConservativeEdgeOffsetT; };
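// For concreteness, the Fixed_16_9 edge offsets encoded by the traits above
// work out as follows (a standalone sketch; the k* constant names are
// illustrative, not part of SWR):

constexpr int32_t kScale_16_9  = 512;                 // 1 << 9 fractional ticks per pixel
constexpr int32_t kOuterOffset = kScale_16_9 / 2 + 1; // 257 == 1/2 pixel + 1/512, pushes edges out
constexpr int32_t kInnerOffset =
    -(kScale_16_9 / 2 + 1) - kOuterOffset;            // -514: undoes the outer offset, pulls edges in
static_assert(kOuterOffset == 257, "matches ConservativeEdgeOffsetT above");
static_assert(kInnerOffset == -514, "matches InnerConservativeEdgeOffsetT above");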
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 2cd61e4abbb..6d378ed36e4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -1,34 +1,34 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file context.h -* -* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT -* The SWR_CONTEXT is our global context and contains the DC ring, -* thread state, etc. -* -* The DRAW_CONTEXT contains all state associated with a draw operation. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file context.h + * + * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT + * The SWR_CONTEXT is our global context and contains the DC ring, + * thread state, etc. + * + * The DRAW_CONTEXT contains all state associated with a draw operation. 
+ * + ******************************************************************************/ #pragma once #include <condition_variable> @@ -59,9 +59,9 @@ struct TRI_FLAGS { uint32_t frontFacing : 1; uint32_t yMajor : 1; - uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); + uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM); uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); - float pointSize; + float pointSize; uint32_t renderTargetArrayIndex; uint32_t viewportIndex; }; @@ -77,14 +77,15 @@ struct SWR_TRIANGLE_DESC float OneOverW[3]; float recipDet; - float *pRecipW; - float *pAttribs; - float *pPerspAttribs; - float *pSamplePos; - float *pUserClipBuffer; + float* pRecipW; + float* pAttribs; + float* pPerspAttribs; + float* pSamplePos; + float* pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; - uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered + uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if + // entire pixel is covered uint64_t anyCoveredSamples; TRI_FLAGS triFlags; @@ -92,10 +93,10 @@ struct SWR_TRIANGLE_DESC struct TRIANGLE_WORK_DESC { - float *pTriBuffer; - float *pAttribs; - float *pUserClipBuffer; - uint32_t numAttribs; + float* pTriBuffer; + float* pAttribs; + float* pUserClipBuffer; + uint32_t numAttribs; TRI_FLAGS triFlags; }; @@ -104,33 +105,33 @@ struct CLEAR_DESC SWR_RECT rect; uint32_t attachmentMask; uint32_t renderTargetArrayIndex; - float clearRTColor[4]; // RGBA_32F - float clearDepth; // [0..1] - uint8_t clearStencil; + float clearRTColor[4]; // RGBA_32F + float clearDepth; // [0..1] + uint8_t clearStencil; }; struct DISCARD_INVALIDATE_TILES_DESC { - uint32_t attachmentMask; - SWR_RECT rect; + uint32_t attachmentMask; + SWR_RECT rect; SWR_TILE_STATE newTileState; - bool createNewTiles; - bool fullTilesOnly; + bool createNewTiles; + bool fullTilesOnly; }; struct SYNC_DESC { PFN_CALLBACK_FUNC pfnCallbackFunc; - uint64_t userData; - uint64_t userData2; - uint64_t userData3; + uint64_t userData; + uint64_t userData2; + uint64_t userData3; }; struct STORE_TILES_DESC { - uint32_t attachmentMask; + uint32_t attachmentMask; SWR_TILE_STATE postStoreTileState; - SWR_RECT rect; + SWR_RECT rect; }; struct COMPUTE_DESC @@ -140,7 +141,10 @@ struct COMPUTE_DESC uint32_t threadGroupCountZ; }; -typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); +typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroTile, + void* pDesc); enum WORK_TYPE { @@ -154,51 +158,55 @@ enum WORK_TYPE OSALIGNSIMD(struct) BE_WORK { - WORK_TYPE type; + WORK_TYPE type; PFN_WORK_FUNC pfnWork; union { - SYNC_DESC sync; - TRIANGLE_WORK_DESC tri; - CLEAR_DESC clear; + SYNC_DESC sync; + TRIANGLE_WORK_DESC tri; + CLEAR_DESC clear; DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; - STORE_TILES_DESC storeTiles; + STORE_TILES_DESC storeTiles; } desc; }; struct DRAW_WORK { - DRAW_CONTEXT* pDC; + DRAW_CONTEXT* pDC; union { - uint32_t numIndices; // DrawIndexed: Number of indices for draw. - uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) + uint32_t numIndices; // DrawIndexed: Number of indices for draw. + uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) }; union { - gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices - uint32_t startVertex; // Draw: Starting vertex in VB to render from. 
+ gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices + uint32_t startVertex; // Draw: Starting vertex in VB to render from. }; - int32_t baseVertex; - uint32_t numInstances; // Number of instances - uint32_t startInstance; // Instance offset - uint32_t startPrimID; // starting primitiveID for this draw batch - uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) - SWR_FORMAT type; // index buffer type + int32_t baseVertex; + uint32_t numInstances; // Number of instances + uint32_t startInstance; // Instance offset + uint32_t startPrimID; // starting primitiveID for this draw batch + uint32_t + startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) + SWR_FORMAT type; // index buffer type }; -typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); +typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + void* pDesc); struct FE_WORK { - WORK_TYPE type; + WORK_TYPE type; PFN_FE_WORK_FUNC pfnWork; union { - SYNC_DESC sync; - DRAW_WORK draw; - CLEAR_DESC clear; + SYNC_DESC sync; + DRAW_WORK draw; + CLEAR_DESC clear; DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; - STORE_TILES_DESC storeTiles; + STORE_TILES_DESC storeTiles; } desc; }; @@ -213,13 +221,25 @@ struct GUARDBANDS struct PA_STATE; // function signature for pipeline stages that execute after primitive assembly -typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], - uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); +typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[], + uint32_t primMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); #if ENABLE_AVX512_SIMD16 // function signature for pipeline stages that execute after primitive assembly -typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], - uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); +typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[], + uint32_t primMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); #endif OSALIGNLINE(struct) API_STATE @@ -228,86 +248,85 @@ OSALIGNLINE(struct) API_STATE SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; // GS - Geometry Shader State - SWR_GS_STATE gsState; - PFN_GS_FUNC pfnGsFunc; + SWR_GS_STATE gsState; + PFN_GS_FUNC pfnGsFunc; // FS - Fetch Shader State - PFN_FETCH_FUNC pfnFetchFunc; + PFN_FETCH_FUNC pfnFetchFunc; // VS - Vertex Shader State - PFN_VERTEX_FUNC pfnVertexFunc; + PFN_VERTEX_FUNC pfnVertexFunc; // Index Buffer - SWR_INDEX_BUFFER_STATE indexBuffer; + SWR_INDEX_BUFFER_STATE indexBuffer; // CS - Compute Shader - PFN_CS_FUNC pfnCsFunc; - uint32_t totalThreadsInGroup; - uint32_t totalSpillFillSize; - uint32_t scratchSpaceSize; - uint32_t scratchSpaceNumInstances; + PFN_CS_FUNC pfnCsFunc; + uint32_t totalThreadsInGroup; + uint32_t totalSpillFillSize; + uint32_t scratchSpaceSize; + uint32_t scratchSpaceNumInstances; // FE - Frontend State - SWR_FRONTEND_STATE frontendState; + SWR_FRONTEND_STATE frontendState; // SOS - Streamout Shader State - PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; 
+ PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; // Streamout state - SWR_STREAMOUT_STATE soState; + SWR_STREAMOUT_STATE soState; mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; // Tessellation State - PFN_HS_FUNC pfnHsFunc; - PFN_DS_FUNC pfnDsFunc; - SWR_TS_STATE tsState; + PFN_HS_FUNC pfnHsFunc; + PFN_DS_FUNC pfnDsFunc; + SWR_TS_STATE tsState; // Number of attributes used by the frontend (vs, so, gs) - uint32_t feNumAttributes; - + uint32_t feNumAttributes; // RS - Rasterizer State - SWR_RASTSTATE rastState; + SWR_RASTSTATE rastState; // floating point multisample offsets float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; - GUARDBANDS gbState; + GUARDBANDS gbState; - SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_VIEWPORT_MATRICES vpMatrices; + SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; + SWR_VIEWPORT_MATRICES vpMatrices; - SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; - bool scissorsTileAligned; + SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; + SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; + bool scissorsTileAligned; - bool forceFront; - PRIMITIVE_TOPOLOGY topology; + bool forceFront; + PRIMITIVE_TOPOLOGY topology; // Backend state OSALIGNLINE(SWR_BACKEND_STATE) backendState; - SWR_DEPTH_BOUNDS_STATE depthBoundsState; + SWR_DEPTH_BOUNDS_STATE depthBoundsState; // PS - Pixel shader state - SWR_PS_STATE psState; + SWR_PS_STATE psState; SWR_DEPTH_STENCIL_STATE depthStencilState; // OM - Output Merger State - SWR_BLEND_STATE blendState; - PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; + SWR_BLEND_STATE blendState; + PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; struct { - uint32_t enableStatsFE : 1; // Enable frontend pipeline stats - uint32_t enableStatsBE : 1; // Enable backend pipeline stats - uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles - uint32_t depthHottileEnable: 1; // Enable depth buffer hottile - uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile + uint32_t enableStatsFE : 1; // Enable frontend pipeline stats + uint32_t enableStatsBE : 1; // Enable backend pipeline stats + uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles + uint32_t depthHottileEnable : 1; // Enable depth buffer hottile + uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile }; - PFN_QUANTIZE_DEPTH pfnQuantizeDepth; + PFN_QUANTIZE_DEPTH pfnQuantizeDepth; }; class MacroTileMgr; @@ -343,13 +362,23 @@ struct BarycentricCoeffs }; // pipeline function pointer types -typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); -typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*, - const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &); -typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &); -typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); -typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t, - simdscalar const &, simdscalar const &); +typedef void (*PFN_BACKEND_FUNC)( + DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); +typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&, + uint8_t* (&)[SWR_NUM_RENDERTARGETS], + uint32_t, + const SWR_BLEND_STATE*, + const PFN_BLEND_JIT_FUNC 
(&)[SWR_NUM_RENDERTARGETS], + simdscalar&, + simdscalar const&); +typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); +typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); +typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, + SWR_PS_CONTEXT&, + const uint64_t* const, + const uint32_t, + simdscalar const&, + simdscalar const&); struct BACKEND_FUNCS { @@ -361,16 +390,16 @@ struct DRAW_STATE { API_STATE state; - void* pPrivateState; // Its required the driver sets this up for each draw. + void* pPrivateState; // It's required the driver sets this up for each draw. // pipeline function pointers, filled in by API thread when setting up the draw - BACKEND_FUNCS backendFuncs; + BACKEND_FUNCS backendFuncs; PFN_PROCESS_PRIMS pfnProcessPrims; #if USE_SIMD16_FRONTEND PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16; #endif - CachingArena* pArena; // This should only be used by API thread. + CachingArena* pArena; // This should only be used by API thread. }; struct DRAW_DYNAMIC_STATE @@ -386,7 +415,7 @@ struct DRAW_DYNAMIC_STATE uint32_t SoWriteOffset[4]; bool SoWriteOffsetDirty[4]; - SWR_STATS_FE statsFE; // Only one FE thread per DC. + SWR_STATS_FE statsFE; // Only one FE thread per DC. SWR_STATS* pStats; }; @@ -395,30 +424,30 @@ struct DRAW_DYNAMIC_STATE // This draw context maintains all of the state needed for the draw operation. struct DRAW_CONTEXT { - SWR_CONTEXT* pContext; + SWR_CONTEXT* pContext; union { - MacroTileMgr* pTileMgr; - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + MacroTileMgr* pTileMgr; + DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) }; - DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. - CachingArena* pArena; + DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. + CachingArena* pArena; - uint32_t drawId; - bool dependentFE; // Frontend work is dependent on all previous FE - bool dependent; // Backend work is dependent on all previous BE - bool isCompute; // Is this DC a compute context? - bool cleanupState; // True if this is the last draw using an entry in the state ring. + uint32_t drawId; + bool dependentFE; // Frontend work is dependent on all previous FE + bool dependent; // Backend work is dependent on all previous BE + bool isCompute; // Is this DC a compute context? + bool cleanupState; // True if this is the last draw using an entry in the state ring. - FE_WORK FeWork; + FE_WORK FeWork; - SYNC_DESC retireCallback; // Call this func when this DC is retired. + SYNC_DESC retireCallback; // Call this func when this DC is retired. DRAW_DYNAMIC_STATE dynState; - volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(uint32_t) threadsDone; + volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? + volatile OSALIGNLINE(uint32_t) FeLock; + volatile OSALIGNLINE(uint32_t) threadsDone; }; static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); @@ -444,14 +473,14 @@ class HotTileMgr; struct SWR_CONTEXT { // Draw Context Ring - // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. - // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number - // of draws that can be in flight at any given time.
+ // Each draw needs its own state in order to support multiple draws in flight across multiple + // threads. We maintain N draw contexts configured as a ring. The size of the ring limits the + // maximum number of draws that can be in flight at any given time. // // Description: // 1. State - When an application first sets state we'll request a new draw context to use. - // a. If there are no available draw contexts then we'll have to wait until one becomes free. - // b. If one is available then set pCurDrawContext to point to it and mark it in use. + // a. If there are no available draw contexts then we'll have to wait until one becomes + // free. b. If one is available then set pCurDrawContext to point to it and mark it in use. // c. All state calls set state on pCurDrawContext. // 2. Draw - Creates and submits a work item that is associated with the current draw context. // a. Set pPrevDrawContext = pCurDrawContext @@ -461,10 +490,11 @@ struct SWR_CONTEXT // b. State is copied from prev draw context to current. RingBuffer<DRAW_CONTEXT> dcRing; - DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. - DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. + DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. + DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted + // that we can copy state from. - MacroTileMgr* pMacroTileManagerArray; + MacroTileMgr* pMacroTileManagerArray; DispatchQueue* pDispatchQueueArray; // Draw State Ring @@ -474,33 +504,33 @@ struct SWR_CONTEXT // to reference a single entry in the DS ring. RingBuffer<DRAW_STATE> dsRing; - uint32_t curStateId; // Current index to the next available entry in the DS ring. + uint32_t curStateId; // Current index to the next available entry in the DS ring. uint32_t NumWorkerThreads; uint32_t NumFEThreads; uint32_t NumBEThreads; - THREAD_POOL threadPool; // Thread pool associated with this context - SWR_THREADING_INFO threadInfo; - SWR_API_THREADING_INFO apiThreadInfo; + THREAD_POOL threadPool; // Thread pool associated with this context + SWR_THREADING_INFO threadInfo; + SWR_API_THREADING_INFO apiThreadInfo; SWR_WORKER_PRIVATE_STATE workerPrivateState; uint32_t MAX_DRAWS_IN_FLIGHT; std::condition_variable FifosNotEmpty; - std::mutex WaitLock; + std::mutex WaitLock; uint32_t privateStateSize; - HotTileMgr *pHotTileMgr; + HotTileMgr* pHotTileMgr; // Callback functions, passed in at create context time - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_CLEAR_TILE pfnClearTile; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; - PFN_UPDATE_STATS_FE pfnUpdateStatsFE; + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; + PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; + PFN_UPDATE_STATS pfnUpdateStats; + PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Global Stats @@ -509,7 +539,7 @@ struct SWR_CONTEXT // Scratch space for workers.
uint8_t** ppScratch; - volatile OSALIGNLINE(uint32_t) drawsOutstandingFE; + volatile OSALIGNLINE(uint32_t) drawsOutstandingFE; OSALIGNLINE(CachingAllocator) cachingArenaAllocator; uint32_t frameCount; @@ -522,27 +552,35 @@ struct SWR_CONTEXT HANDLE* pArContext; }; -#define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; } -#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; } +#define UPDATE_STAT_BE(name, count) \ + if (GetApiState(pDC).enableStatsBE) \ + { \ + pDC->dynState.pStats[workerId].name += count; \ + } +#define UPDATE_STAT_FE(name, count) \ + if (GetApiState(pDC).enableStatsFE) \ + { \ + pDC->dynState.statsFE.name += count; \ + } // ArchRast instrumentation framework -#define AR_WORKER_CTX pDC->pContext->pArContext[workerId] -#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads] +#define AR_WORKER_CTX pDC->pContext->pArContext[workerId] +#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads] #ifdef KNOB_ENABLE_RDTSC #define RDTSC_BEGIN(type, drawid) RDTSC_START(type) -#define RDTSC_END(type, count) RDTSC_STOP(type, count, 0) +#define RDTSC_END(type, count) RDTSC_STOP(type, count, 0) #else #define RDTSC_BEGIN(type, count) #define RDTSC_END(type, count) #endif #ifdef KNOB_ENABLE_AR - #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) - #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id) +#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) +#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id) #else - #define _AR_EVENT(ctx, event) - #define _AR_FLUSH(ctx, id) +#define _AR_EVENT(ctx, event) +#define _AR_FLUSH(ctx, id) #endif // Use these macros for api thread. diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index fafc36d1de6..54a3489205a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -1,36 +1,39 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
-* -* @file depthstencil.h -* -* @brief Implements depth/stencil functionality -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file depthstencil.h + * + * @brief Implements depth/stencil functionality + * + ******************************************************************************/ #pragma once #include "common/os.h" #include "format_conversion.h" INLINE -void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stencilRefps, simdscalar &stencilps) +void StencilOp(SWR_STENCILOP op, + simdscalar const& mask, + simdscalar const& stencilRefps, + simdscalar& stencilps) { simdscalari stencil = _simd_castps_si(stencilps); @@ -47,30 +50,31 @@ void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stenc case STENCILOP_INCRSAT: { simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); break; } case STENCILOP_DECRSAT: { simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); break; } case STENCILOP_INCR: { simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); break; } case STENCILOP_DECR: { simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); break; } case STENCILOP_INVERT: { - simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); + simdscalar stencilinvert = + _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); break; } @@ -79,12 +83,11 @@ void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stenc } } - -template<SWR_FORMAT depthFormatT> -simdscalar QuantizeDepth(simdscalar const 
&depth) +template <SWR_FORMAT depthFormatT> +simdscalar QuantizeDepth(simdscalar const& depth) { SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0); - uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0); + uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0); if (depthType == SWR_TYPE_FLOAT) { @@ -98,11 +101,11 @@ simdscalar QuantizeDepth(simdscalar const &depth) // should be unorm depth if not float SWR_ASSERT(depthType == SWR_TYPE_UNORM); - float quantize = (float)((1 << depthBpc) - 1); - simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize)); - result = _simd_add_ps(result, _simd_set1_ps(0.5f)); - result = _simd_round_ps(result, _MM_FROUND_TO_ZERO); - + float quantize = (float)((1 << depthBpc) - 1); + simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize)); + result = _simd_add_ps(result, _simd_set1_ps(0.5f)); + result = _simd_round_ps(result, _MM_FROUND_TO_ZERO); + if (depthBpc > 16) { result = _simd_div_ps(result, _simd_set1_ps(quantize)); @@ -116,42 +119,62 @@ simdscalar QuantizeDepth(simdscalar const &depth) } INLINE -simdscalar DepthStencilTest(const API_STATE* pState, - bool frontFacing, uint32_t viewportIndex, simdscalar const &iZ, uint8_t* pDepthBase, simdscalar const &coverageMask, - uint8_t *pStencilBase, simdscalar* pStencilMask) +simdscalar DepthStencilTest(const API_STATE* pState, + bool frontFacing, + uint32_t viewportIndex, + simdscalar const& iZ, + uint8_t* pDepthBase, + simdscalar const& coverageMask, + uint8_t* pStencilBase, + simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); - const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; - const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex]; + const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; + const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex]; simdscalar depthResult = _simd_set1_ps(-1.0f); simdscalar zbuf; // clamp Z to viewport [minZ..maxZ] - simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); - simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ)); - + if (pDSState->depthTestEnable) { switch (pDSState->depthTestFunc) { - case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break; - case ZFUNC_ALWAYS: break; + case ZFUNC_NEVER: + depthResult = _simd_setzero_ps(); + break; + case ZFUNC_ALWAYS: + break; default: zbuf = _simd_load_ps((const float*)pDepthBase); } switch (pDSState->depthTestFunc) { - case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break; - case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break; - case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break; - case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break; - case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break; - case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break; + case ZFUNC_LE: + depthResult = _simd_cmple_ps(interpZ, zbuf); + break; + case ZFUNC_LT: + depthResult = _simd_cmplt_ps(interpZ, zbuf); + break; + case ZFUNC_GT: + depthResult = _simd_cmpgt_ps(interpZ, zbuf); + break; + case ZFUNC_GE: + depthResult = _simd_cmpge_ps(interpZ, zbuf); + break; + case ZFUNC_EQ: + depthResult = _simd_cmpeq_ps(interpZ, zbuf); + break; + case ZFUNC_NE: + 
depthResult = _simd_cmpneq_ps(interpZ, zbuf); + break; } } @@ -159,9 +182,9 @@ simdscalar DepthStencilTest(const API_STATE* pState, if (pDSState->stencilTestEnable) { - uint8_t stencilRefValue; + uint8_t stencilRefValue; uint32_t stencilTestFunc; - uint8_t stencilTestMask; + uint8_t stencilTestMask; if (frontFacing || !pDSState->doubleSidedStencilTestEnable) { stencilRefValue = pDSState->stencilRefValue; @@ -178,15 +201,19 @@ simdscalar DepthStencilTest(const API_STATE* pState, simdvector sbuf; simdscalar stencilWithMask; simdscalar stencilRef; - switch(stencilTestFunc) + switch (stencilTestFunc) { - case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break; - case ZFUNC_ALWAYS: break; + case ZFUNC_NEVER: + stencilMask = _simd_setzero_ps(); + break; + case ZFUNC_ALWAYS: + break; default: LoadSOA<R8_UINT>(pStencilBase, sbuf); - + // apply stencil read mask - stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); + stencilWithMask = _simd_castsi_ps( + _simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); // do stencil compare in float to avoid simd integer emulation in AVX1 stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); @@ -195,34 +222,52 @@ simdscalar DepthStencilTest(const API_STATE* pState, break; } - switch(stencilTestFunc) + switch (stencilTestFunc) { - case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break; - case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break; - case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break; - case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break; - case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break; - case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break; + case ZFUNC_LE: + stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); + break; + case ZFUNC_LT: + stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); + break; + case ZFUNC_GT: + stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); + break; + case ZFUNC_GE: + stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); + break; + case ZFUNC_EQ: + stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); + break; + case ZFUNC_NE: + stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); + break; } } simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); - depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask); + depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask); *pStencilMask = stencilMask; return depthWriteMask; } INLINE -void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar const &iZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, - uint8_t *pStencilBase, const simdscalar& stencilMask) +void DepthStencilWrite(const SWR_VIEWPORT* pViewport, + const SWR_DEPTH_STENCIL_STATE* pDSState, + bool frontFacing, + simdscalar const& iZ, + uint8_t* pDepthBase, + const simdscalar& depthMask, + const simdscalar& coverageMask, + uint8_t* pStencilBase, + const simdscalar& stencilMask) { if (pDSState->depthWriteEnable) { // clamp Z to viewport [minZ..maxZ] - simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); - simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); simdscalar interpZ = 
_simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ)); simdscalar vMask = _simd_and_ps(depthMask, coverageMask); @@ -235,49 +280,56 @@ void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_ST LoadSOA<R8_UINT>(pStencilBase, sbuf); simdscalar stencilbuf = sbuf.v[0]; - uint8_t stencilRefValue; + uint8_t stencilRefValue; uint32_t stencilFailOp; uint32_t stencilPassDepthPassOp; uint32_t stencilPassDepthFailOp; - uint8_t stencilWriteMask; + uint8_t stencilWriteMask; if (frontFacing || !pDSState->doubleSidedStencilTestEnable) { - stencilRefValue = pDSState->stencilRefValue; - stencilFailOp = pDSState->stencilFailOp; + stencilRefValue = pDSState->stencilRefValue; + stencilFailOp = pDSState->stencilFailOp; stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; - stencilWriteMask = pDSState->stencilWriteMask; + stencilWriteMask = pDSState->stencilWriteMask; } else { - stencilRefValue = pDSState->backfaceStencilRefValue; - stencilFailOp = pDSState->backfaceStencilFailOp; + stencilRefValue = pDSState->backfaceStencilRefValue; + stencilFailOp = pDSState->backfaceStencilFailOp; stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; - stencilWriteMask = pDSState->backfaceStencilWriteMask; + stencilWriteMask = pDSState->backfaceStencilWriteMask; } - simdscalar stencilps = stencilbuf; + simdscalar stencilps = stencilbuf; simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); - simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask); + simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask); simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask); - simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); + simdscalar stencilPassDepthFailMask = + _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); simdscalar origStencil = stencilps; StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); - StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps); - StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, + stencilPassDepthFailMask, + stencilRefps, + stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, + stencilPassDepthPassMask, + stencilRefps, + stencilps); // apply stencil write mask simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); - stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); - stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); + stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); + stencilps = + _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); simdvector stencilResult; stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask); StoreSOA<R8_UINT>(stencilResult, pStencilBase); } - } diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 43d3a832267..9a9cc2635df 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -1,53 +1,52 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. 
All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file fifo.hpp -* -* @brief Definitions for our fifos used for thread communication. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file fifo.hpp + * + * @brief Definitions for our fifos used for thread communication. 
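The QUEUE defined next hands entries between threads, and its enqueue path copies whole entries with non-temporal (streaming) stores so the producer's cache is not polluted with data that only a consumer thread will read. A minimal sketch of that copy idiom, assuming AVX and 32-byte-aligned buffers whose size is a multiple of 32 bytes; StreamCopy is an invented name:

#include <immintrin.h>
#include <cstddef>

// Copy one queue entry of 'bytes' (a multiple of 32, both pointers
// 32-byte aligned) using non-temporal stores, in the manner of
// enqueue_try_nosync below. Streaming stores bypass this core's cache,
// since the entry will be read by a consumer thread, not the producer.
inline void StreamCopy(const float* src, float* dst, std::size_t bytes)
{
    for (std::size_t i = 0; i < bytes / 32; ++i)
    {
        __m256 v = _mm256_load_ps(src + i * 8); // one 32-byte SIMD line
        _mm256_stream_ps(dst + i * 8, v);
    }
    // A standalone user would fence (_mm_sfence) before publishing the
    // entry to another thread; here the queue's locking provides ordering.
}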
+ * + ******************************************************************************/ #pragma once - #include "common/os.h" #include "arena.h" #include <vector> #include <cassert> -template<class T> +template <class T> struct QUEUE { - OSALIGNLINE(volatile uint32_t) mLock{ 0 }; - OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 }; + OSALIGNLINE(volatile uint32_t) mLock{0}; + OSALIGNLINE(volatile uint32_t) mNumEntries{0}; std::vector<T*> mBlocks; - T* mCurBlock{ nullptr }; - uint32_t mHead{ 0 }; - uint32_t mTail{ 0 }; - uint32_t mCurBlockIdx{ 0 }; + T* mCurBlock{nullptr}; + uint32_t mHead{0}; + uint32_t mTail{0}; + uint32_t mCurBlockIdx{0}; // power of 2 static const uint32_t mBlockSizeShift = 6; - static const uint32_t mBlockSize = 1 << mBlockSizeShift; + static const uint32_t mBlockSize = 1 << mBlockSizeShift; template <typename ArenaT> void clear(ArenaT& arena) @@ -55,18 +54,15 @@ struct QUEUE mHead = 0; mTail = 0; mBlocks.clear(); - T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4); + T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4); mBlocks.push_back(pNewBlock); - mCurBlock = pNewBlock; + mCurBlock = pNewBlock; mCurBlockIdx = 0; - mNumEntries = 0; - mLock = 0; + mNumEntries = 0; + mLock = 0; } - uint32_t getNumQueued() - { - return mNumEntries; - } + uint32_t getNumQueued() { return mNumEntries; } bool tryLock() { @@ -80,10 +76,7 @@ struct QUEUE return (initial == 0); } - void unlock() - { - mLock = 0; - } + void unlock() { mLock = 0; } T* peek() { @@ -92,34 +85,33 @@ struct QUEUE return nullptr; } uint32_t block = mHead >> mBlockSizeShift; - return &mBlocks[block][mHead & (mBlockSize-1)]; + return &mBlocks[block][mHead & (mBlockSize - 1)]; } void dequeue_noinc() { - mHead ++; - mNumEntries --; + mHead++; + mNumEntries--; } template <typename ArenaT> bool enqueue_try_nosync(ArenaT& arena, const T* entry) { const float* pSrc = (const float*)entry; - float* pDst = (float*)&mCurBlock[mTail]; + float* pDst = (float*)&mCurBlock[mTail]; - auto lambda = [&](int32_t i) - { - __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH); - _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); + auto lambda = [&](int32_t i) { + __m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH); + _mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc); }; - const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4); + const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4); static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T), - "FIFO element size should be multiple of SIMD width."); + "FIFO element size should be multiple of SIMD width."); UnrollerL<0, numSimdLines, 1>::step(lambda); - mTail ++; + mTail++; if (mTail == mBlockSize) { if (++mCurBlockIdx < mBlocks.size()) @@ -128,7 +120,7 @@ struct QUEUE } else { - T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4); + T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4); SWR_ASSERT(newBlock); mBlocks.push_back(newBlock); @@ -138,12 +130,9 @@ struct QUEUE mTail = 0; } - mNumEntries ++; + mNumEntries++; return true; } - void destroy() - { - } - + void destroy() {} }; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 72843f59062..90bf118727e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -1,30 +1,30 @@ 
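Two details of the QUEUE just shown are easy to miss: tryLock owns the lock only when an atomic exchange returns the previous value 0, and peek()/enqueue split an index into a block number and offset with a shift and mask because the block size is a power of two. A sketch using std::atomic in place of the aligned volatile uint32_t plus platform exchange intrinsic; QueueSketch is an invented name:

#include <atomic>
#include <cstdint>

// Sketch of QUEUE's locking and indexing scheme.
struct QueueSketch
{
    std::atomic<uint32_t> mLock{0};

    // The lock is taken by atomically swapping in 1; we own it only if
    // the previous value was 0 -- the same contract as QUEUE::tryLock().
    bool tryLock() { return mLock.exchange(1) == 0; }
    void unlock() { mLock.store(0); }

    // Power-of-two block addressing, as in QUEUE::peek(): 64 entries per
    // block, so an index splits into block number and offset within it.
    static const uint32_t mBlockSizeShift = 6;
    static const uint32_t mBlockSize = 1 << mBlockSizeShift;

    static uint32_t blockOf(uint32_t idx) { return idx >> mBlockSizeShift; }
    static uint32_t offsetOf(uint32_t idx) { return idx & (mBlockSize - 1); }
};

With 64-entry blocks, for example, an index of 130 resolves to block 2, offset 2.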
/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file format_conversion.h -* -* @brief API implementation -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file format_conversion.h + * + * @brief API implementation + * + ******************************************************************************/ #include "format_types.h" #include "format_traits.h" @@ -33,15 +33,15 @@ /// SOA RGBA32_FLOAT format. 
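LoadSOA below relies on the SOA ("structure of arrays") layout: each component occupies one contiguous simdscalar-sized plane, which is why the fast path can address component comp at pSrc + comp * sizeof(simdscalar). A scalar sketch of that addressing under an assumed 8-wide layout; LoadComponentSOA is an invented helper:

#include <cstdint>
#include <cstring>

// Assumed 8-wide layout standing in for simdscalar (8 floats = 32 bytes).
const int SIMD_WIDTH = 8;

// In SOA form, RGBA for 8 pixels is stored as four contiguous planes:
// RRRRRRRR GGGGGGGG BBBBBBBB AAAAAAAA. Component 'comp' of lane 'lane'
// therefore sits at byte offset (comp * SIMD_WIDTH + lane) * sizeof(float),
// matching the pSrc + comp * sizeof(simdscalar) addressing in the fast
// path below.
float LoadComponentSOA(const uint8_t* pSrc, int comp, int lane)
{
    float v;
    std::memcpy(&v, pSrc + (comp * SIMD_WIDTH + lane) * sizeof(float), sizeof(v));
    return v;
}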
/// @param pSrc - source data in SOA form /// @param dst - output data in SOA form -template<SWR_FORMAT SrcFormat> -INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) +template <SWR_FORMAT SrcFormat> +INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) { // fast path for float32 - if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) + if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && + (FormatTraits<SrcFormat>::GetBPC(0) == 32)) { - auto lambda = [&](int comp) - { - simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar))); + auto lambda = [&](int comp) { + simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar))); dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; }; @@ -50,8 +50,7 @@ INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) return; } - auto lambda = [&](int comp) - { + auto lambda = [&](int comp) { // load SIMD components simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc); @@ -74,12 +73,12 @@ INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) } ////////////////////////////////////////////////////////////////////////// -/// @brief Clamps the given component based on the requirements on the +/// @brief Clamps the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template<SWR_FORMAT Format> -INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component) +template <SWR_FORMAT Format> +INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component) { simdscalar vComp = vC; if (FormatTraits<Format>::isNormalized(Component)) @@ -99,21 +98,21 @@ INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component) { if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT) { - int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; - int iMin = 0; + int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; + int iMin = 0; simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); + vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); + vComp = _simd_castsi_ps(vCompi); } else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT) { - int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; + int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; + int iMin = -1 - iMax; simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); + vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); + vComp = _simd_castsi_ps(vCompi); } } @@ -125,8 +124,8 @@ INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component) /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template<SWR_FORMAT Format> -INLINE simdscalar Normalize(simdscalar const &vC, uint32_t Component) +template <SWR_FORMAT Format> +INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component) { simdscalar vComp = vC; if (FormatTraits<Format>::isNormalized(Component)) @@ -142,11 +141,12 @@ INLINE simdscalar Normalize(simdscalar const &vC, uint32_t 
Component) /// RGBA32_FLOAT to SOA format /// @param src - source data in SOA form /// @param dst - output data in SOA form -template<SWR_FORMAT DstFormat> -INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) +template <SWR_FORMAT DstFormat> +INLINE void StoreSOA(const simdvector& src, uint8_t* pDst) { // fast path for float32 - if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) + if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && + (FormatTraits<DstFormat>::GetBPC(0) == 32)) { for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) { @@ -155,25 +155,24 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) { - if (comp < 3) // Input format is always RGBA32_FLOAT. + if (comp < 3) // Input format is always RGBA32_FLOAT. { vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); } } - _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); + _simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp); } return; } - auto lambda = [&](int comp) - { + auto lambda = [&](int comp) { simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) { - if (comp < 3) // Input format is always RGBA32_FLOAT. + if (comp < 3) // Input format is always RGBA32_FLOAT. { vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); } @@ -203,15 +202,16 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) /// SOA RGBA32_FLOAT format. /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form -template<SWR_FORMAT SrcFormat> -INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst) +template <SWR_FORMAT SrcFormat> +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) { // fast path for float32 - if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) + if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && + (FormatTraits<SrcFormat>::GetBPC(0) == 32)) { - auto lambda = [&](int comp) - { - simd16scalar vComp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc + comp * sizeof(simd16scalar))); + auto lambda = [&](int comp) { + simd16scalar vComp = + _simd16_load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(simd16scalar))); dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; }; @@ -220,8 +220,7 @@ INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst) return; } - auto lambda = [&](int comp) - { + auto lambda = [&](int comp) { // load SIMD components simd16scalar vComp = FormatTraits<SrcFormat>::loadSOA_16(comp, pSrc); @@ -244,12 +243,12 @@ INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst) } ////////////////////////////////////////////////////////////////////////// -/// @brief Clamps the given component based on the requirements on the +/// @brief Clamps the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template<SWR_FORMAT Format> -INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component) +template <SWR_FORMAT Format> +INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) { simd16scalar vComp = v; if (FormatTraits<Format>::isNormalized(Component)) @@ -269,21 +268,21 @@ INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component) { if 
(FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT) { - int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; - int iMin = 0; + int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; + int iMin = 0; simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); + vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin)); + vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax)); + vComp = _simd16_castsi_ps(vCompi); } else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT) { - int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; + int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; + int iMin = -1 - iMax; simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); + vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin)); + vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax)); + vComp = _simd16_castsi_ps(vCompi); } } @@ -295,8 +294,8 @@ INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component) /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template<SWR_FORMAT Format> -INLINE simd16scalar SIMDCALL Normalize(simd16scalar const &vComp, uint32_t Component) +template <SWR_FORMAT Format> +INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component) { simd16scalar r = vComp; if (FormatTraits<Format>::isNormalized(Component)) @@ -312,11 +311,12 @@ INLINE simd16scalar SIMDCALL Normalize(simd16scalar const &vComp, uint32_t Compo /// RGBA32_FLOAT to SOA format /// @param src - source data in SOA form /// @param dst - output data in SOA form -template<SWR_FORMAT DstFormat> -INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst) +template <SWR_FORMAT DstFormat> +INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) { // fast path for float32 - if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) + if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && + (FormatTraits<DstFormat>::GetBPC(0) == 32)) { for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) { @@ -325,25 +325,24 @@ INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst) // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) { - if (comp < 3) // Input format is always RGBA32_FLOAT. + if (comp < 3) // Input format is always RGBA32_FLOAT. { vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); } } - _simd16_store_ps(reinterpret_cast<float *>(pDst + comp * sizeof(simd16scalar)), vComp); + _simd16_store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp); } return; } - auto lambda = [&](int comp) - { + auto lambda = [&](int comp) { simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) { - if (comp < 3) // Input format is always RGBA32_FLOAT. + if (comp < 3) // Input format is always RGBA32_FLOAT. 
{ vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); } diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h index bc585dd175a..97e7d56e48e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file format_traits.h -* -* @brief Format Traits. auto-generated file -* -* DO NOT EDIT -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file format_traits.h + * + * @brief Format Traits. 
auto-generated file + * + * DO NOT EDIT + * + ******************************************************************************/ #pragma once #include "format_types.h" @@ -35,13 +35,13 @@ ////////////////////////////////////////////////////////////////////////// /// FormatSwizzle - Component swizzle selects ////////////////////////////////////////////////////////////////////////// -template<uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0> +template <uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0> struct FormatSwizzle { // Return swizzle select for component. INLINE static uint32_t swizzle(uint32_t c) { - static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; + static const uint32_t s[4] = {comp0, comp1, comp2, comp3}; return s[c]; } }; @@ -49,41 +49,45 @@ struct FormatSwizzle ////////////////////////////////////////////////////////////////////////// /// FormatTraits - Format traits ////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT format> -struct FormatTraits : - ComponentTraits<SWR_TYPE_UNKNOWN, 0>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0> +template <SWR_FORMAT format> +struct FormatTraits : ComponentTraits<SWR_TYPE_UNKNOWN, 0>, FormatSwizzle<0>, Defaults<0, 0, 0, 0> { - static const uint32_t bpp{ 0 }; - static const uint32_t numComps{ 0 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; + static const uint32_t bpp{0}; + static const uint32_t numComps{0}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; }; ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, + 32, + SWR_TYPE_FLOAT, + 32, + SWR_TYPE_FLOAT, + 32, + SWR_TYPE_FLOAT, + 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -92,20 +96,21 @@ template<> struct FormatTraits<R32G32B32A32_FLOAT> : 
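Everything in these FormatTraits specializations is consumed at compile time. Assuming format_traits.h is included, the values reformatted above can be exercised with static_asserts; BytesPerPixel is an invented helper, while the trait values come straight from this hunk:

#include <cstdint>

// Compile-time checks against the specialization above; the values (128
// bpp, 4 components, alpha in component 3) are taken from this hunk.
static_assert(FormatTraits<R32G32B32A32_FLOAT>::bpp == 128,
              "R32G32B32A32_FLOAT is 128 bits per pixel");
static_assert(FormatTraits<R32G32B32A32_FLOAT>::numComps == 4,
              "four components");
static_assert(FormatTraits<R32G32B32A32_FLOAT>::hasAlpha &&
                  FormatTraits<R32G32B32A32_FLOAT>::alphaComp == 3,
              "alpha lives in component 3");

// Generic code can size pixel storage from the traits:
template <SWR_FORMAT format>
constexpr uint32_t BytesPerPixel()
{
    return FormatTraits<format>::bpp / 8;
}

static_assert(BytesPerPixel<R32G32B32A32_FLOAT>() == 16, "16 bytes per pixel");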
////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_SINT> : - ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_SINT> + : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -114,20 +119,21 @@ template<> struct FormatTraits<R32G32B32A32_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_UINT> : - ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_UINT> + : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -136,20 +142,20 @@ template<> struct FormatTraits<R32G32B32A32_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R64G64_FLOAT> - Format traits specialization for R64G64_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R64G64_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static 
const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R64G64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose64_64 TransposeT; typedef Format2<64, 64> FormatT; @@ -158,20 +164,27 @@ template<> struct FormatTraits<R64G64_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32X32_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, + 32, + SWR_TYPE_FLOAT, + 32, + SWR_TYPE_FLOAT, + 32, + SWR_TYPE_UNUSED, + 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -180,20 +193,27 @@ template<> struct FormatTraits<R32G32B32X32_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, + 32, + SWR_TYPE_SSCALED, + 32, + SWR_TYPE_SSCALED, + 32, + SWR_TYPE_SSCALED, + 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const 
bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -202,20 +222,27 @@ template<> struct FormatTraits<R32G32B32A32_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, + 32, + SWR_TYPE_USCALED, + 32, + SWR_TYPE_USCALED, + 32, + SWR_TYPE_USCALED, + 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -224,20 +251,27 @@ template<> struct FormatTraits<R32G32B32A32_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32A32_SFIXED> - Format traits specialization for R32G32B32A32_SFIXED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32G32B32A32_SFIXED> : - ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32G32B32A32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, + 32, + SWR_TYPE_SFIXED, + 32, + SWR_TYPE_SFIXED, + 32, + SWR_TYPE_SFIXED, + 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose32_32_32_32 TransposeT; typedef Format4<32, 32, 32, 32> FormatT; @@ -246,20 +280,21 @@ template<> struct FormatTraits<R32G32B32A32_SFIXED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT 
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -268,20 +303,21 @@ template<> struct FormatTraits<R32G32B32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -290,20 +326,21 @@ template<> struct FormatTraits<R32G32B32_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
    typedef Format3<32, 32, 32> FormatT;
@@ -312,20 +349,21 @@ template<> struct FormatTraits<R32G32B32_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -334,20 +372,21 @@ template<> struct FormatTraits<R32G32B32_SSCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -356,20 +395,21 @@ template<> struct FormatTraits<R32G32B32_USCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SFIXED> - Format traits specialization for R32G32B32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32B32_SFIXED>
+    : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32 TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -378,20 +418,27 @@ template<> struct FormatTraits<R32G32B32_SFIXED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization for R16G16B16A16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -400,20 +447,27 @@ template<> struct FormatTraits<R16G16B16A16_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -422,20 +476,21 @@ template<> struct FormatTraits<R16G16B16A16_SNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -444,20 +499,21 @@ template<> struct FormatTraits<R16G16B16A16_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -466,20 +522,27 @@ template<> struct FormatTraits<R16G16B16A16_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -488,20 +551,20 @@ template<> struct FormatTraits<R16G16B16A16_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -510,20 +573,20 @@ template<> struct FormatTraits<R32G32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_SINT> : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -532,42 +595,44 @@ template<> struct FormatTraits<R32G32_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_UINT> - Format traits specialization for R32G32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_UINT> : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for
+/// R32_FLOAT_X8X24_TYPELESS
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_FLOAT_X8X24_TYPELESS>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+      FormatSwizzle<0, 1>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -576,20 +641,21 @@ template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<X32_TYPELESS_G8X24_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
+      FormatSwizzle<0, 1>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -598,20 +664,20 @@ template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32A32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<L32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -620,20 +686,19 @@ template<> struct FormatTraits<L32A32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R64_FLOAT> - Format traits specialization for R64_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R64_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 64>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R64_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 64>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<64> TransposeT;
    typedef Format1<64> FormatT;
@@ -642,20 +707,27 @@ template<> struct FormatTraits<R64_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16X16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNUSED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16X16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNUSED,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -664,20 +736,27 @@ template<> struct FormatTraits<R16G16B16X16_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16X16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_UNUSED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16X16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_UNUSED,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -686,20 +765,20 @@ template<> struct FormatTraits<R16G16B16X16_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32X32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<L32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -708,20 +787,20 @@ template<> struct FormatTraits<L32X32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I32X32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<I32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -730,20 +809,27 @@ template<> struct FormatTraits<I32X32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -752,20 +838,27 @@ template<> struct FormatTraits<R16G16B16A16_SSCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16B16A16_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16 TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -774,20 +867,20 @@ template<> struct FormatTraits<R16G16B16A16_USCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -796,20 +889,20 @@ template<> struct FormatTraits<R32G32_SSCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -818,20 +911,20 @@ template<> struct FormatTraits<R32G32_USCALED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SFIXED> - Format traits specialization for R32G32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32G32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
                                     FormatSwizzle<0, 1>,
+                                     Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32 TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -840,20 +933,21 @@ template<> struct FormatTraits<R32G32_SFIXED> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<B8G8R8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -862,20 +956,21 @@ template<> struct FormatTraits<B8G8R8A8_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<B8G8R8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{true};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -884,20 +979,27 @@ template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R10G10B10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         2>,
+                                         FormatSwizzle<0, 1, 2, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -906,20 +1008,27 @@ template<> struct FormatTraits<R10G10B10A2_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R10G10B10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              2>,
+                                              FormatSwizzle<0, 1, 2, 3>,
+                                              Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{true};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -928,20 +1037,21 @@ template<> struct FormatTraits<R10G10B10A2_UNORM_SRGB> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R10G10B10A2_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -950,20 +1060,21 @@ template<> struct FormatTraits<R10G10B10A2_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R8G8B8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -972,20 +1083,21 @@ template<> struct FormatTraits<R8G8B8A8_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R8G8B8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{true};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -994,20 +1106,21 @@ template<> struct FormatTraits<R8G8B8A8_UNORM_SRGB> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_SNORM> - Format traits specialization for R8G8B8A8_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R8G8B8A8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1016,20 +1129,21 @@ template<> struct FormatTraits<R8G8B8A8_SNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R8G8B8A8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1038,20 +1152,21 @@ template<> struct FormatTraits<R8G8B8A8_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R8G8B8A8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8 TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1060,20 +1175,20 @@ template<> struct FormatTraits<R8G8B8A8_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16 TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1082,20 +1197,20 @@ template<> struct FormatTraits<R16G16_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16 TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1104,20 +1219,20 @@ template<> struct FormatTraits<R16G16_SNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16_SINT> : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16 TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1126,20 +1241,20 @@ template<> struct FormatTraits<R16G16_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16_UINT> : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16 TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1148,20 +1263,20 @@ template<> struct FormatTraits<R16G16_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R16G16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16 TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1170,20 +1285,27 @@ template<> struct FormatTraits<R16G16_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<B10G10R10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         2>,
+                                         FormatSwizzle<2, 1, 0, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1192,20 +1314,27 @@ template<> struct FormatTraits<B10G10R10A2_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<B10G10R10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              2>,
+                                              FormatSwizzle<2, 1, 0, 3>,
+                                              Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{true};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1214,42 +1343,51 @@ template<> struct FormatTraits<B10G10R10A2_UNORM_SRGB> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R11G11B10_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R11G11B10_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose11_11_10 TransposeT;
     typedef Format3<11, 11, 10> FormatT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for R10G10B10_FLOAT_A2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> :
-    ComponentTraits<SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for
+/// R10G10B10_FLOAT_A2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_UNORM,
+                                                                2>,
+                                                FormatSwizzle<0, 1, 2, 3>,
+                                                Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2 TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1258,20 +1396,19 @@ template<> struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32> FormatT;
@@ -1280,20 +1417,19 @@ template<> struct FormatTraits<R32_SINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32> FormatT;
@@ -1302,20 +1438,19 @@ template<> struct FormatTraits<R32_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
    typedef Format1<32> FormatT;
@@ -1324,20 +1459,19 @@ template<> struct FormatTraits<R32_FLOAT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R24_UNORM_X8_TYPELESS> :
-    ComponentTraits<SWR_TYPE_UNORM, 24>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<R24_UNORM_X8_TYPELESS>
+    : ComponentTraits<SWR_TYPE_UNORM, 24>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<24> FormatT;
@@ -1346,20 +1480,19 @@ template<> struct FormatTraits<R24_UNORM_X8_TYPELESS> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<X24_TYPELESS_G8_UINT> - Format traits specialization for X24_TYPELESS_G8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<X24_TYPELESS_G8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32>,
-    FormatSwizzle<1>,
-    Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<X24_TYPELESS_G8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<1>, Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32> FormatT;
@@ -1368,20 +1501,19 @@ template<> struct FormatTraits<X24_TYPELESS_G8_UINT> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32_UNORM> - Format traits specialization for L32_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+template <>
+struct FormatTraits<L32_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool isSRGB{false};
+    static const bool isBC{false};
+    static const bool isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32> FormatT;
@@ -1390,20 +1522,20 @@ template<> struct FormatTraits<L32_UNORM> :
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct
FormatTraits<L16A16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{1}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16 TransposeT; typedef Format2<16, 16> FormatT; @@ -1412,20 +1544,20 @@ template<> struct FormatTraits<L16A16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I24X8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose24_8 TransposeT; typedef Format2<24, 8> FormatT; @@ -1434,20 +1566,20 @@ template<> struct FormatTraits<I24X8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L24X8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool 
isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose24_8 TransposeT; typedef Format2<24, 8> FormatT; @@ -1456,20 +1588,19 @@ template<> struct FormatTraits<L24X8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I32_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I32_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -1478,20 +1609,19 @@ template<> struct FormatTraits<I32_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L32_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L32_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -1500,20 +1630,19 @@ template<> struct FormatTraits<L32_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A32_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 32>, - FormatSwizzle<3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ 
false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A32_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -1522,20 +1651,21 @@ template<> struct FormatTraits<A32_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B8G8R8X8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B8G8R8X8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -1544,20 +1674,21 @@ template<> struct FormatTraits<B8G8R8X8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B8G8R8X8_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B8G8R8X8_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; 
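// A minimal, self-contained sketch (mock names, not the real SWR headers) of
// how trait blocks like the ones above are typically consumed at compile
// time: the row pitch of an uncompressed surface follows directly from
// FormatTraits<F>::bpp.
#include <cstdint>

enum MockFormat
{
    MOCK_B8G8R8X8_UNORM
};

template <MockFormat F>
struct MockFormatTraits;

template <>
struct MockFormatTraits<MOCK_B8G8R8X8_UNORM>
{
    static const uint32_t bpp{32};      // matches the B8G8R8X8_UNORM block above
    static const uint32_t numComps{3};  // the X channel is SWR_TYPE_UNUSED
};

template <MockFormat F>
constexpr uint32_t RowPitchBytes(uint32_t width)
{
    return width * MockFormatTraits<F>::bpp / 8; // 32 bpp -> 4 bytes per texel
}

static_assert(RowPitchBytes<MOCK_B8G8R8X8_UNORM>(1024) == 4096,
              "a 1024-wide 32bpp row is 4 KiB");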
typedef Format4<8, 8, 8, 8> FormatT; @@ -1566,20 +1697,21 @@ template<> struct FormatTraits<B8G8R8X8_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8X8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8X8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -1588,20 +1720,21 @@ template<> struct FormatTraits<R8G8B8X8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8X8_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8X8_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -1610,20 +1743,21 @@ template<> struct FormatTraits<R8G8B8X8_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R9G9B9E5_SHAREDEXP> : - ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t 
bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R9G9B9E5_SHAREDEXP> + : ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose9_9_9_5 TransposeT; typedef Format4<9, 9, 9, 5> FormatT; @@ -1632,20 +1766,27 @@ template<> struct FormatTraits<R9G9B9E5_SHAREDEXP> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10X2_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNUSED, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10X2_UNORM> : ComponentTraits<SWR_TYPE_UNORM, + 10, + SWR_TYPE_UNORM, + 10, + SWR_TYPE_UNORM, + 10, + SWR_TYPE_UNUSED, + 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -1654,20 +1795,20 @@ template<> struct FormatTraits<B10G10R10X2_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L16A16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t 
alphaComp{1}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16 TransposeT; typedef Format2<16, 16> FormatT; @@ -1676,20 +1817,27 @@ template<> struct FormatTraits<L16A16_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R10G10B10X2_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_UNUSED, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R10G10B10X2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_UNUSED, + 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -1698,20 +1846,27 @@ template<> struct FormatTraits<R10G10B10X2_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8A8_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8A8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, + 8, + SWR_TYPE_SSCALED, + 8, + SWR_TYPE_SSCALED, + 8, + SWR_TYPE_SSCALED, + 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -1720,20 +1875,27 @@ template<> struct FormatTraits<R8G8B8A8_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED 
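// The SWR_TYPE_* tag in ComponentTraits decides how a raw texel component
// becomes a float lane. A hedged illustration (standalone helpers, not SWR
// functions): UNORM normalizes by the maximum code, while USCALED/SSCALED
// keep the integer magnitude and merely convert it.
#include <cstdint>

static inline float FetchUnorm8(uint8_t raw) // SWR_TYPE_UNORM, 8-bit
{
    return raw / 255.0f; // 0..255 -> 0.0..1.0
}

static inline float FetchUscaled8(uint8_t raw) // SWR_TYPE_USCALED, 8-bit
{
    return static_cast<float>(raw); // 0..255 -> 0.0..255.0
}

static inline float FetchSscaled8(int8_t raw) // SWR_TYPE_SSCALED, 8-bit
{
    return static_cast<float>(raw); // -128..127, sign preserved
}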
////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8A8_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8A8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, + 8, + SWR_TYPE_USCALED, + 8, + SWR_TYPE_USCALED, + 8, + SWR_TYPE_USCALED, + 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -1742,20 +1904,20 @@ template<> struct FormatTraits<R8G8B8A8_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16 TransposeT; typedef Format2<16, 16> FormatT; @@ -1764,20 +1926,20 @@ template<> struct FormatTraits<R16G16_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16_USCALED> : 
ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16 TransposeT; typedef Format2<16, 16> FormatT; @@ -1786,20 +1948,19 @@ template<> struct FormatTraits<R16G16_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 32>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32_SSCALED> + : ComponentTraits<SWR_TYPE_SSCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -1808,20 +1969,19 @@ template<> struct FormatTraits<R32_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 32>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32_USCALED> + : ComponentTraits<SWR_TYPE_USCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -1830,20 +1990,21 @@ template<> struct FormatTraits<R32_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G6R5_UNORM> : - 
ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, - FormatSwizzle<2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G6R5_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_6_5 TransposeT; typedef Format3<5, 6, 5> FormatT; @@ -1852,20 +2013,21 @@ template<> struct FormatTraits<B5G6R5_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G6R5_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, - FormatSwizzle<2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G6R5_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_6_5 TransposeT; typedef Format3<5, 6, 5> FormatT; @@ -1874,20 +2036,21 @@ template<> struct FormatTraits<B5G6R5_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G5R5A1_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G5R5A1_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 
0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_5_5_1 TransposeT; typedef Format4<5, 5, 5, 1> FormatT; @@ -1896,20 +2059,21 @@ template<> struct FormatTraits<B5G5R5A1_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G5R5A1_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G5R5A1_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_5_5_1 TransposeT; typedef Format4<5, 5, 5, 1> FormatT; @@ -1918,20 +2082,21 @@ template<> struct FormatTraits<B5G5R5A1_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B4G4R4A4_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B4G4R4A4_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose4_4_4_4 TransposeT; typedef Format4<4, 4, 4, 4> FormatT; @@ -1940,20 +2105,21 @@ template<> struct FormatTraits<B4G4R4A4_UNORM> : ////////////////////////////////////////////////////////////////////////// /// 
FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for B4G4R4A4_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B4G4R4A4_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B4G4R4A4_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose4_4_4_4 TransposeT; typedef Format4<4, 4, 4, 4> FormatT; @@ -1962,20 +2128,20 @@ template<> struct FormatTraits<B4G4R4A4_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -1984,20 +2150,20 @@ template<> struct FormatTraits<R8G8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_SNORM> : 
ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2006,20 +2172,20 @@ template<> struct FormatTraits<R8G8_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2028,20 +2194,20 @@ template<> struct FormatTraits<R8G8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2050,20 +2216,19 @@ template<> struct FormatTraits<R8G8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_UNORM> - Format traits specialization for R16_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16>, - 
FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2072,20 +2237,19 @@ template<> struct FormatTraits<R16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_SNORM> + : ComponentTraits<SWR_TYPE_SNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2094,20 +2258,19 @@ template<> struct FormatTraits<R16_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_SINT> : - ComponentTraits<SWR_TYPE_SINT, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_SINT> + : ComponentTraits<SWR_TYPE_SINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; 
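// Why the final Defaults<> argument alternates between 0x3f800000 and 0x1 in
// the blocks above: the defaults are raw 32-bit patterns, and 0x3f800000 is
// the IEEE-754 bit pattern of 1.0f, consistent with float and normalized
// formats defaulting alpha to 1.0 while SINT/UINT formats default it to a
// literal integer 1. A standalone check (not an SWR helper):
#include <cstdint>
#include <cstring>

static inline float BitsToFloat(uint32_t bits)
{
    float f;
    std::memcpy(&f, &bits, sizeof(f)); // well-defined type pun
    return f;
}

// BitsToFloat(0x3f800000) == 1.0f, whereas 0x1 reinterpreted as float would be
// a denormal; hence the integer formats carry 0x1 as a plain integer one.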
@@ -2116,20 +2279,19 @@ template<> struct FormatTraits<R16_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_UINT> : - ComponentTraits<SWR_TYPE_UINT, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_UINT> + : ComponentTraits<SWR_TYPE_UINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2138,20 +2300,19 @@ template<> struct FormatTraits<R16_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2160,20 +2321,19 @@ template<> struct FormatTraits<R16_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I16_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t 
bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2182,20 +2342,19 @@ template<> struct FormatTraits<I16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L16_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2204,20 +2363,19 @@ template<> struct FormatTraits<L16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16>, - FormatSwizzle<3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A16_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2226,20 +2384,20 @@ template<> struct FormatTraits<A16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8A8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static 
const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8A8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{1}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2248,20 +2406,19 @@ template<> struct FormatTraits<L8A8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I16_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2270,20 +2427,19 @@ template<> struct FormatTraits<I16_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L16_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2292,20 +2448,19 @@ template<> struct FormatTraits<L16_FLOAT> : 
////////////////////////////////////////////////////////////////////////// /// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16>, - FormatSwizzle<3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A16_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2314,20 +2469,20 @@ template<> struct FormatTraits<A16_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8A8_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8A8_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{1}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2336,20 +2491,21 @@ template<> struct FormatTraits<L8A8_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G5R5X1_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G5R5X1_UNORM> + : 
ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_5_5_1 TransposeT; typedef Format4<5, 5, 5, 1> FormatT; @@ -2358,20 +2514,21 @@ template<> struct FormatTraits<B5G5R5X1_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B5G5R5X1_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B5G5R5X1_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose5_5_5_1 TransposeT; typedef Format4<5, 5, 5, 1> FormatT; @@ -2380,20 +2537,20 @@ template<> struct FormatTraits<B5G5R5X1_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2402,20 +2559,20 @@ template<> struct FormatTraits<R8G8_SSCALED> : 
////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2424,20 +2581,19 @@ template<> struct FormatTraits<R8G8_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_SSCALED> + : ComponentTraits<SWR_TYPE_SSCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2446,20 +2602,19 @@ template<> struct FormatTraits<R16_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 16>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16_USCALED> + : ComponentTraits<SWR_TYPE_USCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> 
+{ + static const uint32_t bpp{16}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<16> TransposeT; typedef Format1<16> FormatT; @@ -2468,20 +2623,21 @@ template<> struct FormatTraits<R16_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<A1B5G5R5_UNORM> - Format traits specialization for A1B5G5R5_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A1B5G5R5_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>, - FormatSwizzle<3, 2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A1B5G5R5_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>, + FormatSwizzle<3, 2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose1_5_5_5 TransposeT; typedef Format4<1, 5, 5, 5> FormatT; @@ -2490,20 +2646,21 @@ template<> struct FormatTraits<A1B5G5R5_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<A4B4G4R4_UNORM> - Format traits specialization for A4B4G4R4_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A4B4G4R4_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<3, 2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A4B4G4R4_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<3, 2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose4_4_4_4 TransposeT; typedef Format4<4, 4, 4, 4> FormatT; @@ -2512,20 +2669,20 @@ template<> struct FormatTraits<A4B4G4R4_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8A8_UINT> - Format traits specialization 
for L8A8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8A8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8A8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{1}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2534,20 +2691,20 @@ template<> struct FormatTraits<L8A8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8A8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 16 }; - static const uint32_t numComps{ 2 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 1 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8A8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{16}; + static const uint32_t numComps{2}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{1}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8 TransposeT; typedef Format2<8, 8> FormatT; @@ -2556,20 +2713,19 @@ template<> struct FormatTraits<L8A8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const 
bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2578,20 +2734,19 @@ template<> struct FormatTraits<R8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8_SNORM> + : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2600,20 +2755,19 @@ template<> struct FormatTraits<R8_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_SINT> - Format traits specialization for R8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8_SINT> + : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2622,20 +2776,19 @@ template<> struct FormatTraits<R8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 
}; +template <> +struct FormatTraits<R8_UINT> + : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2644,20 +2797,19 @@ template<> struct FormatTraits<R8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<A8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<A8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2666,20 +2818,19 @@ template<> struct FormatTraits<A8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2688,20 +2839,19 @@ template<> struct FormatTraits<I8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 
0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2710,20 +2860,19 @@ template<> struct FormatTraits<L8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8_SSCALED> + : ComponentTraits<SWR_TYPE_SSCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2732,20 +2881,19 @@ template<> struct FormatTraits<R8_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8_USCALED> + : ComponentTraits<SWR_TYPE_USCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2754,20 +2902,19 
@@ template<> struct FormatTraits<R8_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2776,20 +2923,19 @@ template<> struct FormatTraits<L8_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8_UINT> + : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2798,20 +2944,19 @@ template<> struct FormatTraits<L8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<L8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<L8_SINT> + : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + 
static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2820,20 +2965,19 @@ template<> struct FormatTraits<L8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I8_UINT> + : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2842,20 +2986,19 @@ template<> struct FormatTraits<I8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<I8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<I8_SINT> + : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2864,20 +3007,19 @@ template<> struct FormatTraits<I8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<DXT1_RGB_SRGB> - Format traits specialization for DXT1_RGB_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<DXT1_RGB_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ 
true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<DXT1_RGB_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2886,20 +3028,21 @@ template<> struct FormatTraits<DXT1_RGB_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<YCRCB_SWAPUVY> : - ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ true }; - static const uint32_t bcWidth{ 2 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<YCRCB_SWAPUVY> + : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{true}; + static const uint32_t bcWidth{2}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -2908,20 +3051,19 @@ template<> struct FormatTraits<YCRCB_SWAPUVY> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC1_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC1_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2930,20 +3072,19 @@ template<> struct FormatTraits<BC1_UNORM> : 
////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC2_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC2_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2952,20 +3093,19 @@ template<> struct FormatTraits<BC2_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC3_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC3_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2974,20 +3114,19 @@ template<> struct FormatTraits<BC3_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC4_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC4_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool 
hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -2996,20 +3135,19 @@ template<> struct FormatTraits<BC4_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC5_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC5_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3018,20 +3156,19 @@ template<> struct FormatTraits<BC5_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC1_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ true }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC1_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3040,20 +3177,19 @@ template<> struct FormatTraits<BC1_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC2_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - 
static const bool isSRGB{ true }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC2_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3062,20 +3198,19 @@ template<> struct FormatTraits<BC2_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC3_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ true }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC3_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3084,20 +3219,21 @@ template<> struct FormatTraits<BC3_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<YCRCB_SWAPUV> : - ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ true }; - static const uint32_t bcWidth{ 2 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<YCRCB_SWAPUV> + : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{true}; + static const uint32_t bcWidth{2}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8_8 TransposeT; typedef Format4<8, 8, 8, 8> FormatT; @@ -3106,20 +3242,19 @@ template<> struct 
FormatTraits<YCRCB_SWAPUV> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<DXT1_RGB> - Format traits specialization for DXT1_RGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<DXT1_RGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<DXT1_RGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3128,20 +3263,21 @@ template<> struct FormatTraits<DXT1_RGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3150,20 +3286,21 @@ template<> struct FormatTraits<R8G8B8_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_SNORM> + 
: ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3172,20 +3309,21 @@ template<> struct FormatTraits<R8G8B8_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_SSCALED> + : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3194,20 +3332,21 @@ template<> struct FormatTraits<R8G8B8_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_USCALED> + : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3216,20 +3355,27 @@ template<> struct FormatTraits<R8G8B8_USCALED> : ////////////////////////////////////////////////////////////////////////// /// 
FormatTraits<R64G64B64A64_FLOAT> - Format traits specialization for R64G64B64A64_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R64G64B64A64_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 256 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R64G64B64A64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, + 64, + SWR_TYPE_FLOAT, + 64, + SWR_TYPE_FLOAT, + 64, + SWR_TYPE_FLOAT, + 64>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{256}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose64_64_64_64 TransposeT; typedef Format4<64, 64, 64, 64> FormatT; @@ -3238,20 +3384,21 @@ template<> struct FormatTraits<R64G64B64A64_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R64G64B64_FLOAT> - Format traits specialization for R64G64B64_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R64G64B64_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 192 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R64G64B64_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{192}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose64_64_64 TransposeT; typedef Format3<64, 64, 64> FormatT; @@ -3260,20 +3407,19 @@ template<> struct FormatTraits<R64G64B64_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC4_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 64 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - 
static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC4_SNORM> + : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{64}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3282,20 +3428,19 @@ template<> struct FormatTraits<BC4_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC5_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC5_SNORM> + : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3304,20 +3449,21 @@ template<> struct FormatTraits<BC5_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_FLOAT> : - ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_FLOAT> + : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3326,20 +3472,21 @@ template<> struct FormatTraits<R16G16B16_FLOAT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM 
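// A minimal compile-time sketch of how these trait tables are consumed,
// using only members visible in the hunks above; it is illustrative, not
// part of the change. Every field is a static constant, so format
// properties resolve at compile time and can be checked with static_assert:
static_assert(FormatTraits<R16G16B16_FLOAT>::bpp == 48, "three 16-bit channels");
static_assert(FormatTraits<R16G16B16_FLOAT>::numComps == 3, "RGB, no alpha channel");
static_assert(!FormatTraits<R16G16B16_FLOAT>::hasAlpha, "alphaComp 0 is a placeholder");
// The TransposeT/FormatT typedefs (here Transpose16_16_16 and
// Format3<16, 16, 16>) select the matching SoA transpose and bit-packing
// helpers for the format, so callers need no runtime dispatch.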
////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3348,20 +3495,21 @@ template<> struct FormatTraits<R16G16B16_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_SNORM> + : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3370,20 +3518,21 @@ template<> struct FormatTraits<R16G16B16_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct 
FormatTraits<R16G16B16_SSCALED> + : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3392,20 +3541,21 @@ template<> struct FormatTraits<R16G16B16_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_USCALED> + : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3414,20 +3564,19 @@ template<> struct FormatTraits<R16G16B16_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16 ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC6H_SF16> : - ComponentTraits<SWR_TYPE_SNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC6H_SF16> + : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3436,20 +3585,19 @@ template<> struct FormatTraits<BC6H_SF16> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC7_UNORM> - Format traits 
specialization for BC7_UNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC7_UNORM> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC7_UNORM> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3458,20 +3606,19 @@ template<> struct FormatTraits<BC7_UNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC7_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ true }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC7_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{true}; + static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3480,20 +3627,19 @@ template<> struct FormatTraits<BC7_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16 ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<BC6H_UF16> : - ComponentTraits<SWR_TYPE_UNORM, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 128 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ true }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 4 }; - static const uint32_t bcHeight{ 4 }; +template <> +struct FormatTraits<BC6H_UF16> + : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{128}; + static const uint32_t numComps{1}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + 
static const bool isBC{true}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{4}; + static const uint32_t bcHeight{4}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; @@ -3502,20 +3648,21 @@ template<> struct FormatTraits<BC6H_UF16> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_UNORM_SRGB> : - ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ true }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_UNORM_SRGB> + : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{true}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3524,20 +3671,21 @@ template<> struct FormatTraits<R8G8B8_UNORM_SRGB> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_UINT> : - ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_UINT> + : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3546,20 +3694,21 @@ template<> struct FormatTraits<R16G16B16_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R16G16B16_SINT> : - ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - 
static const uint32_t bpp{ 48 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R16G16B16_SINT> + : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{48}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose16_16_16 TransposeT; typedef Format3<16, 16, 16> FormatT; @@ -3568,20 +3717,19 @@ template<> struct FormatTraits<R16G16B16_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R32_SFIXED> - Format traits specialization for R32_SFIXED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R32_SFIXED> : - ComponentTraits<SWR_TYPE_SFIXED, 32>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R32_SFIXED> + : ComponentTraits<SWR_TYPE_SFIXED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<32> TransposeT; typedef Format1<32> FormatT; @@ -3590,20 +3738,27 @@ template<> struct FormatTraits<R32_SFIXED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R10G10B10A2_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R10G10B10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool 
isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3612,20 +3767,27 @@ template<> struct FormatTraits<R10G10B10A2_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R10G10B10A2_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R10G10B10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3634,20 +3796,27 @@ template<> struct FormatTraits<R10G10B10A2_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R10G10B10A2_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R10G10B10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3656,20 +3825,21 @@ template<> struct FormatTraits<R10G10B10A2_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT 
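// Sketch: the bpp/bcWidth/bcHeight fields above make surface pitch math
// uniform across linear and block-compressed formats. The helper below is
// hypothetical (not part of this file) and assumes the usual SWR_FORMAT
// enum as the FormatTraits parameter; it is built only on traits shown in
// the surrounding hunks:
template <SWR_FORMAT F>
constexpr uint32_t RowPitchBytes(uint32_t widthInTexels)
{
    // Linear formats have bcWidth == 1, so this is just texels * bpp / 8;
    // for BC formats bpp counts bits per block, e.g. BC7_UNORM stores
    // 128 bits per 4x4 block == (128 / 8) / (4 * 4) == 1 byte per texel.
    return ((widthInTexels + FormatTraits<F>::bcWidth - 1) / FormatTraits<F>::bcWidth) *
           (FormatTraits<F>::bpp / 8);
}
// e.g. RowPitchBytes<R10G10B10A2_SNORM>(256) == 256 * 4 == 1024 bytes, while
// RowPitchBytes<BC7_UNORM>(256) == 64 blocks * 16 bytes == 1024 bytes.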
////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R10G10B10A2_SINT> : - ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R10G10B10A2_SINT> + : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3678,20 +3848,27 @@ template<> struct FormatTraits<R10G10B10A2_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10A2_SNORM> : - ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 10, + SWR_TYPE_SNORM, + 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3700,20 +3877,27 @@ template<> struct FormatTraits<B10G10R10A2_SNORM> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10A2_USCALED> : - ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - 
static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 10, + SWR_TYPE_USCALED, + 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3722,20 +3906,27 @@ template<> struct FormatTraits<B10G10R10A2_USCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10A2_SSCALED> : - ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 10, + SWR_TYPE_SSCALED, + 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3744,20 +3935,21 @@ template<> struct FormatTraits<B10G10R10A2_SSCALED> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for B10G10R10A2_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10A2_UINT> : - ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10A2_UINT> + : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool 
isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3766,20 +3958,21 @@ template<> struct FormatTraits<B10G10R10A2_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<B10G10R10A2_SINT> : - ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 32 }; - static const uint32_t numComps{ 4 }; - static const bool hasAlpha{ true }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<B10G10R10A2_SINT> + : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{32}; + static const uint32_t numComps{4}; + static const bool hasAlpha{true}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose10_10_10_2 TransposeT; typedef Format4<10, 10, 10, 2> FormatT; @@ -3788,20 +3981,21 @@ template<> struct FormatTraits<B10G10R10A2_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_UINT> : - ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 24 }; - static const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_UINT> + : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3810,20 +4004,21 @@ template<> struct FormatTraits<R8G8B8_UINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<R8G8B8_SINT> : - ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 24 }; - static 
const uint32_t numComps{ 3 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 0 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<R8G8B8_SINT> + : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{24}; + static const uint32_t numComps{3}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{0}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef Transpose8_8_8 TransposeT; typedef Format3<8, 8, 8> FormatT; @@ -3832,20 +4027,19 @@ template<> struct FormatTraits<R8G8B8_SINT> : ////////////////////////////////////////////////////////////////////////// /// FormatTraits<RAW> - Format traits specialization for RAW ////////////////////////////////////////////////////////////////////////// -template<> struct FormatTraits<RAW> : - ComponentTraits<SWR_TYPE_UINT, 8>, - FormatSwizzle<0>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{ 8 }; - static const uint32_t numComps{ 1 }; - static const bool hasAlpha{ false }; - static const uint32_t alphaComp{ 3 }; - static const bool isSRGB{ false }; - static const bool isBC{ false }; - static const bool isSubsampled{ false }; - static const uint32_t bcWidth{ 1 }; - static const uint32_t bcHeight{ 1 }; +template <> +struct FormatTraits<RAW> + : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{8}; + static const uint32_t numComps{1}; + static const bool hasAlpha{false}; + static const uint32_t alphaComp{3}; + static const bool isSRGB{false}; + static const bool isBC{false}; + static const bool isSubsampled{false}; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; typedef TransposeSingleComponent<8> TransposeT; typedef Format1<8> FormatT; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index c3327c1d40b..518da829d58 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file formats.h -* -* @brief Definitions for SWR_FORMAT functions. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file formats.h + * + * @brief Definitions for SWR_FORMAT functions. + * + ******************************************************************************/ #pragma once #include "utils.h" @@ -36,16 +36,16 @@ template <uint32_t NumBits, bool Signed = false> struct PackTraits { - static const uint32_t MyNumBits = NumBits; - static simdscalar loadSOA(const uint8_t *pSrc) = delete; - static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete; - static simdscalar unpack(simdscalar &in) = delete; - static simdscalar pack(simdscalar &in) = delete; + static const uint32_t MyNumBits = NumBits; + static simdscalar loadSOA(const uint8_t* pSrc) = delete; + static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete; + static simdscalar unpack(simdscalar& in) = delete; + static simdscalar pack(simdscalar& in) = delete; #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete; - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete; - static simd16scalar unpack(simd16scalar &in) = delete; - static simd16scalar pack(simd16scalar &in) = delete; + static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete; + static simd16scalar unpack(simd16scalar& in) = delete; + static simd16scalar pack(simd16scalar& in) = delete; #endif }; @@ -57,15 +57,15 @@ struct PackTraits<0, false> { static const uint32_t MyNumBits = 0; - static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); } - static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; } - static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } - static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } + static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); } + static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; 
} + static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); } + static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); } #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); } - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; } - static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); } - static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); } + static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; } + static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); } + static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); } #endif }; @@ -77,18 +77,18 @@ struct PackTraits<8, false> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const uint8_t *pSrc) + static simdscalar loadSOA(const uint8_t* pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); + __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); return _mm256_insertf128_ps(result, vLo, 0); #else #error Unsupported vector width #endif } - static void storeSOA(uint8_t *pDst, simdscalar const &src) + static void storeSOA(uint8_t* pDst, simdscalar const& src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -98,31 +98,33 @@ struct PackTraits<8, false> #endif } - static simdscalar unpack(simdscalar &in) + static simdscalar unpack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH <= KNOB_ARCH_AVX - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); + __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); __m128i resLo = _mm_cvtepu8_epi32(src); - __m128i resHi = _mm_shuffle_epi8(src, - _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); + __m128i resHi = + _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); - return simdscalar{ _mm256_castsi256_ps(result) }; + result = _mm256_insertf128_si256(result, resHi, 1); + return simdscalar{_mm256_castsi256_ps(result)}; #else - return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); + return _mm256_castsi256_ps( + _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); #endif #else #error Unsupported vector width #endif } - static simdscalar pack(simdscalar &in) + static simdscalar pack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_castps_si(in); - __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); + __m128i res16 = + _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); #else @@ -131,51 +133,64 @@ struct PackTraits<8, false> } #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) + static simd16scalar loadSOA_16(const uint8_t* pSrc) { - simd16scalar result = _simd16_setzero_ps(); - simdscalar resultlo = _simd_setzero_ps(); + simd16scalar result = _simd16_setzero_ps(); + simdscalar resultlo = _simd_setzero_ps(); - const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc)); + const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); 
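// The 16 packed source bytes now occupy a 128-bit register; the two inserts
// below merge them into the low lanes of the simd16 result (128 -> 256 ->
// 512 bits), leaving the upper lanes zero so that unpack() can later widen
// each byte into its own 32-bit lane.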
resultlo = _mm256_insertf128_ps(resultlo, src, 0); - result = _simd16_insert_ps(result, resultlo, 0); + result = _simd16_insert_ps(result, resultlo, 0); return result; } - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { // store simd16 bytes - _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); + _mm_store_ps(reinterpret_cast<float*>(pDst), + _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); } - static simd16scalar unpack(simd16scalar &in) + static simd16scalar unpack(simd16scalar& in) { - simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); + simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); simd16scalari result = _simd16_cvtepu8_epi32(tmp); return _simd16_castsi_ps(result); } - static simd16scalar pack(simd16scalar &in) + static simd16scalar pack(simd16scalar& in) { simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = + _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = + _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = + _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packus_epi32( + permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si( + pack, + zero, + 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si( + pack, + zero, + 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = _simd_packus_epi16(permlo, + permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 + // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); @@ -192,18 +207,18 @@ struct PackTraits<8, true> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const uint8_t *pSrc) + static simdscalar loadSOA(const uint8_t* pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); + __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); return _mm256_insertf128_ps(result, vLo, 0); #else #error Unsupported vector width #endif } - static void storeSOA(uint8_t *pDst, simdscalar const &src) + static void storeSOA(uint8_t* pDst, simdscalar const& src) { // store simd bytes #if KNOB_SIMD_WIDTH 
== 8 @@ -213,32 +228,34 @@ struct PackTraits<8, true> #endif } - static simdscalar unpack(simdscalar &in) + static simdscalar unpack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH <= KNOB_ARCH_AVX SWR_INVALID("I think this may be incorrect."); - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); + __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); __m128i resLo = _mm_cvtepi8_epi32(src); - __m128i resHi = _mm_shuffle_epi8(src, - _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); + __m128i resHi = + _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + result = _mm256_insertf128_si256(result, resHi, 1); return _mm256_castsi256_ps(result); #else - return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); + return _mm256_castsi256_ps( + _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); #endif #else #error Unsupported vector width #endif } - static simdscalar pack(simdscalar &in) + static simdscalar pack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_castps_si(in); - __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); + __m128i res16 = + _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); #else @@ -247,51 +264,64 @@ struct PackTraits<8, true> } #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) + static simd16scalar loadSOA_16(const uint8_t* pSrc) { - simd16scalar result = _simd16_setzero_ps(); - simdscalar resultlo = _simd_setzero_ps(); + simd16scalar result = _simd16_setzero_ps(); + simdscalar resultlo = _simd_setzero_ps(); - const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc)); + const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); resultlo = _mm256_insertf128_ps(resultlo, src, 0); - result = _simd16_insert_ps(result, resultlo, 0); + result = _simd16_insert_ps(result, resultlo, 0); return result; } - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { // store simd16 bytes - _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); + _mm_store_ps(reinterpret_cast<float*>(pDst), + _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); } - static simd16scalar unpack(simd16scalar &in) + static simd16scalar unpack(simd16scalar& in) { - simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); + simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); simd16scalari result = _simd16_cvtepu8_epi32(tmp); return _simd16_castsi_ps(result); } - static simd16scalar pack(simd16scalar &in) + static simd16scalar pack(simd16scalar& in) { simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = + _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); 
// r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = + _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = + _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packs_epi32( + permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si( + pack, + zero, + 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si( + pack, + zero, + 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = + _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 + // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); @@ -308,18 +338,18 @@ struct PackTraits<16, false> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const uint8_t *pSrc) + static simdscalar loadSOA(const uint8_t* pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_load_ps((const float*)pSrc); + __m128 vLo = _mm_load_ps((const float*)pSrc); return _mm256_insertf128_ps(result, vLo, 0); #else #error Unsupported vector width #endif } - static void storeSOA(uint8_t *pDst, simdscalar const &src) + static void storeSOA(uint8_t* pDst, simdscalar const& src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -329,31 +359,33 @@ struct PackTraits<16, false> #endif } - static simdscalar unpack(simdscalar &in) + static simdscalar unpack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH <= KNOB_ARCH_AVX - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); + __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); __m128i resLo = _mm_cvtepu16_epi32(src); - __m128i resHi = _mm_shuffle_epi8(src, - _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); + __m128i resHi = + _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + result = _mm256_insertf128_si256(result, resHi, 1); return _mm256_castsi256_ps(result); #else - return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); + return _mm256_castsi256_ps( + _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); #endif #else #error Unsupported vector width #endif } - static simdscalar pack(simdscalar &in) + static simdscalar pack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_castps_si(in); - __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); + __m256i res = _mm256_castsi128_si256( + _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); return _mm256_castsi256_ps(res); #else #error Unsupported vector width @@ 
-361,37 +393,45 @@ struct PackTraits<16, false> } #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) + static simd16scalar loadSOA_16(const uint8_t* pSrc) { simd16scalar result = _simd16_setzero_ps(); - simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc)); + simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); result = _simd16_insert_ps(result, resultlo, 0); return result; } - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { - _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0)); + _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); } - static simd16scalar unpack(simd16scalar &in) + static simd16scalar unpack(simd16scalar& in) { simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); return _simd16_castsi_ps(result); } - static simd16scalar pack(simd16scalar &in) + static simd16scalar pack(simd16scalar& in) { const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si( + _simd16_castps_si(in), + zero, + 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si( + _simd16_castps_si(in), + zero, + 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packus_epi32( + permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 + // 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); } @@ -406,18 +446,18 @@ struct PackTraits<16, true> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const uint8_t *pSrc) + static simdscalar loadSOA(const uint8_t* pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_load_ps((const float*)pSrc); + __m128 vLo = _mm_load_ps((const float*)pSrc); return _mm256_insertf128_ps(result, vLo, 0); #else #error Unsupported vector width #endif } - static void storeSOA(uint8_t *pDst, simdscalar const &src) + static void storeSOA(uint8_t* pDst, simdscalar const& src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -427,32 +467,34 @@ struct PackTraits<16, true> #endif } - static simdscalar unpack(simdscalar &in) + static simdscalar unpack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH <= KNOB_ARCH_AVX SWR_INVALID("I think this may be incorrect."); - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); + __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); __m128i resLo = _mm_cvtepi16_epi32(src); - __m128i resHi = _mm_shuffle_epi8(src, - _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); + __m128i resHi = + _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + result = _mm256_insertf128_si256(result, resHi, 1); return 
_mm256_castsi256_ps(result); #else - return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); + return _mm256_castsi256_ps( + _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); #endif #else #error Unsupported vector width #endif } - static simdscalar pack(simdscalar &in) + static simdscalar pack(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_castps_si(in); - __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); + __m256i res = _mm256_castsi128_si256( + _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); return _mm256_castsi256_ps(res); #else #error Unsupported vector width @@ -460,37 +502,45 @@ struct PackTraits<16, true> } #if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t *pSrc) + static simd16scalar loadSOA_16(const uint8_t* pSrc) { simd16scalar result = _simd16_setzero_ps(); - simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc)); + simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); result = _simd16_insert_ps(result, resultlo, 0); return result; } - static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) + static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { - _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0)); + _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); } - static simd16scalar unpack(simd16scalar &in) + static simd16scalar unpack(simd16scalar& in) { simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); return _simd16_castsi_ps(result); } - static simd16scalar pack(simd16scalar &in) + static simd16scalar pack(simd16scalar& in) { const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si( + _simd16_castps_si(in), + zero, + 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si( + _simd16_castps_si(in), + zero, + 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packs_epi32( + permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 + // 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); } @@ -505,188 +555,281 @@ struct PackTraits<32, false> { static const uint32_t MyNumBits = 32; - static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); } - static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); } - static simdscalar unpack(simdscalar &in) { return in; } - static simdscalar pack(simdscalar &in) { return in; } -#if ENABLE_AVX512_SIMD16 - - static simd16scalar loadSOA_16(const uint8_t *pSrc) + static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); } + static void storeSOA(uint8_t* pDst, simdscalar const& src) { - return 
_simd16_load_ps(reinterpret_cast<const float *>(pSrc));
+        _simd_store_ps((float*)pDst, src);
    }
+    static simdscalar unpack(simdscalar& in) { return in; }
+    static simdscalar pack(simdscalar& in) { return in; }
+#if ENABLE_AVX512_SIMD16
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
    {
-        _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
+        return _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
    }
-    static simd16scalar unpack(simd16scalar &in)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
    {
-        return in;
+        _simd16_store_ps(reinterpret_cast<float*>(pDst), src);
    }
-    static simd16scalar pack(simd16scalar &in)
-    {
-        return in;
-    }
+    static simd16scalar unpack(simd16scalar& in) { return in; }
+
+    static simd16scalar pack(simd16scalar& in) { return in; }
#endif
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits.
//////////////////////////////////////////////////////////////////////////
-template<SWR_TYPE type, uint32_t NumBits>
+template <SWR_TYPE type, uint32_t NumBits>
struct TypeTraits : PackTraits<NumBits>
{
    static const SWR_TYPE MyType = type;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT8
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT16
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT16
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UINT32
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 0.0; }
+    static float fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM5
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 31.0f; }
-    static float fromFloat() { return 31.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 31.0f; }
+    static float fromFloat() { return 31.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM6
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 63.0f; }
-    static float fromFloat() { return 63.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 63.0f; }
+    static float fromFloat() { return 63.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM8
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 255.0f; }
-    static float fromFloat() { return 255.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 255.0f; }
+    static float fromFloat() { return 255.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
+template <>
+struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
-    static float toFloat() { return 1.0f / 127.0f; }
-    static float fromFloat() { return 127.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 127.0f; }
+    static float fromFloat() { return 127.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM16
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 65535.0f; }
-    static float fromFloat() { return 65535.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 65535.0f; }
+    static float fromFloat() { return 65535.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM16
//////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
+template <>
+struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 32767.0f; }
-    static float fromFloat() { return 32767.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 32767.0f; }
+    static float fromFloat() { return 32767.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for UNORM24
//////////////////////////////////////////////////////////////////////////
-template<>
-struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32>
{
    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 16777215.0f; }
-    static float fromFloat() { return 16777215.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float toFloat() { return 1.0f / 16777215.0f; }
+    static float fromFloat() { return 16777215.0f; }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
};
//////////////////////////////////////////////////////////////////////////
@@ -697,44 +840,47 @@ struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
#include "math.h"
-template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static __m128 fastpow(__m128 arg) {
+template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
+inline static __m128 fastpow(__m128 arg)
+{
    __m128 ret = arg;
-    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
-        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
+    static const __m128 factor =
+        _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) *
+                    powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
    // Apply a constant pre-correction factor.
    ret = _mm_mul_ps(ret, factor);
    // Reinterpret arg as integer to obtain logarithm.
-    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
+    // asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
    // Multiply logarithm by power.
    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
    // Convert back to "integer" to exponentiate.
-    //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
+    // asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
    return ret;
}
-inline static __m128 pow512_4(__m128 arg) {
+inline static __m128 pow512_4(__m128 arg)
+{
    // 5/12 is too small, so compute the 4th root of 20/12 instead.
    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
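    // For reference: fastpow<en, ed, cn, cd> above approximates
    // pow((cn / cd) * x, en / ed) with the classic "bits as logarithm" trick.
    // For a positive IEEE-754 float, bits(x) ~= 2^23 * (log2(x) + 127), so
    // reinterpreting the bits as an integer, scaling by the power, and
    // converting back approximates exp2(p * log2(x)) = x^p. A scalar sketch
    // of the same idea (hypothetical helper, not part of this change; needs
    // <cstring> for memcpy):
    //
    //     static inline float fastpow_scalar(float x, float p)
    //     {
    //         int32_t i;
    //         memcpy(&i, &x, sizeof(i));          // bits(x) ~ 2^23 * (log2(x) + 127)
    //         float log2x = i * (1.0f / 8388608.0f) - 127.0f;
    //         int32_t o = (int32_t)((log2x * p + 127.0f) * 8388608.0f);
    //         memcpy(&x, &o, sizeof(x));          // ~ exp2(p * log2(x)) = x^p
    //         return x;
    //     }
    //
    // pow512_4 then builds x^(5/12) (roughly the 1/2.4 sRGB exponent) by
    // averaging two such estimates of x^(5/3) and taking the 4th root with
    // two x * rsqrt(x), i.e. sqrt, refinement steps below.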
// weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 - __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg); + __m128 xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); __m128 xover = _mm_mul_ps(arg, xf); - __m128 xfm1 = _mm_rsqrt_ps(xf); - __m128 x2 = _mm_mul_ps(arg, arg); + __m128 xfm1 = _mm_rsqrt_ps(xf); + __m128 x2 = _mm_mul_ps(arg, arg); __m128 xunder = _mm_mul_ps(x2, xfm1); // sqrt2 * over + 2 * sqrt2 * under __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), - _mm_add_ps(xover, xunder)); + _mm_add_ps(xover, xunder)); xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); @@ -743,17 +889,15 @@ inline static __m128 pow512_4(__m128 arg) { inline static __m128 powf_wrapper(__m128 Base, float Exp) { - float *f = (float *)(&Base); + float* f = (float*)(&Base); - return _mm_set_ps(powf(f[3], Exp), - powf(f[2], Exp), - powf(f[1], Exp), - powf(f[0], Exp)); + return _mm_set_ps(powf(f[3], Exp), powf(f[2], Exp), powf(f[1], Exp), powf(f[0], Exp)); } static inline __m128 ConvertFloatToSRGB2(__m128& Src) { - // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value + // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float + // value __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); // squeeze the mask down to 16 bits (4 bits per DWORD) @@ -779,7 +923,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src) #else __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); #endif - f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); + f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); } else @@ -800,11 +944,12 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src) f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); // Clear the alpha (is garbage after the sub) - __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); + __m128i i = _mm_and_si128(TO_M128i(f), + _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); - __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); + __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); Result = TO_M128(CombinedParts); } @@ -813,43 +958,45 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src) } #if ENABLE_AVX512_SIMD16 -template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden > -inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value) +template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> +inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value) { - static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) - * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum); + static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) * + powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum); // Apply a constant pre-correction factor. simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1)); // Reinterpret arg as integer to obtain logarithm. - //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result)); + // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result)); result = _simd16_cvtepi32_ps(_simd16_castps_si(result)); // Multiply logarithm by power. 
result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden)); // Convert back to "integer" to exponentiate. - //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result)); + // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result)); result = _simd16_castsi_ps(_simd16_cvtps_epi32(result)); return result; } -inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg) +inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg) { // 5/12 is too small, so compute the 4th root of 20/12 instead. // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 - simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg); + simd16scalar xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); simd16scalar xover = _simd16_mul_ps(arg, xf); - simd16scalar xfm1 = _simd16_rsqrt_ps(xf); - simd16scalar x2 = _simd16_mul_ps(arg, arg); + simd16scalar xfm1 = _simd16_rsqrt_ps(xf); + simd16scalar x2 = _simd16_mul_ps(arg, arg); simd16scalar xunder = _simd16_mul_ps(x2, xfm1); // sqrt2 * over + 2 * sqrt2 * under - simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder)); + simd16scalar xavg = + _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), + _simd16_add_ps(xover, xunder)); xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); @@ -857,28 +1004,26 @@ inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg) return xavg; } -inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp) +inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp) { - const float *f = reinterpret_cast<const float *>(&base); - - return _simd16_set_ps( - powf(f[15], exp), - powf(f[14], exp), - powf(f[13], exp), - powf(f[12], exp), - powf(f[11], exp), - powf(f[10], exp), - powf(f[ 9], exp), - powf(f[ 8], exp), - powf(f[ 7], exp), - powf(f[ 6], exp), - powf(f[ 5], exp), - powf(f[ 4], exp), - powf(f[ 3], exp), - powf(f[ 2], exp), - powf(f[ 1], exp), - powf(f[ 0], exp) - ); + const float* f = reinterpret_cast<const float*>(&base); + + return _simd16_set_ps(powf(f[15], exp), + powf(f[14], exp), + powf(f[13], exp), + powf(f[12], exp), + powf(f[11], exp), + powf(f[10], exp), + powf(f[9], exp), + powf(f[8], exp), + powf(f[7], exp), + powf(f[6], exp), + powf(f[5], exp), + powf(f[4], exp), + powf(f[3], exp), + powf(f[2], exp), + powf(f[1], exp), + powf(f[0], exp)); } // float to SRGB conversion formula @@ -888,7 +1033,7 @@ inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float // else // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f; // -static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value) +static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) { // create a mask where the source is < the minimal SRGB float value const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f)); @@ -913,7 +1058,8 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value) // only native AVX512 can directly use the computed mask for the blend operation result = _mm512_mask_blend_ps(mask, result2, result); #else - result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); + result = _simd16_blendv_ps( + result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); #endif } @@ -924,88 
+1070,100 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value) ////////////////////////////////////////////////////////////////////////// /// TypeTraits - Format type traits specialization for FLOAT16 ////////////////////////////////////////////////////////////////////////// -template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> +template <> +struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> { static const SWR_TYPE MyType = SWR_TYPE_FLOAT; - static float toFloat() { return 1.0f; } - static float fromFloat() { return 1.0f; } - static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); } + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static simdscalar convertSrgb(simdscalar& in) + { + SWR_NOT_IMPL; + return _simd_setzero_ps(); + } - static simdscalar pack(const simdscalar &in) + static simdscalar pack(const simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 #if (KNOB_ARCH == KNOB_ARCH_AVX) // input is 8 packed float32, output is 8 packed float16 simdscalari src = _simd_castps_si(in); - static const uint32_t FLOAT_EXP_BITS = 8; + static const uint32_t FLOAT_EXP_BITS = 8; static const uint32_t FLOAT_MANTISSA_BITS = 23; static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; - static const uint32_t HALF_EXP_BITS = 5; + static const uint32_t HALF_EXP_BITS = 5; static const uint32_t HALF_MANTISSA_BITS = 10; static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; // minimum exponent required, exponents below this are flushed to 0. - static const int32_t HALF_EXP_MIN = -14; + static const int32_t HALF_EXP_MIN = -14; static const int32_t FLOAT_EXP_BIAS = 127; - static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; - static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand + static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; + static const int32_t FLOAT_EXP_MIN_FTZ = + FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand // maximum exponent required, exponents above this are set to infinity - static const int32_t HALF_EXP_MAX = 15; + static const int32_t HALF_EXP_MAX = 15; static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; - const simdscalari vSignMask = _simd_set1_epi32(0x80000000); - const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); - const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); - const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); - const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); - const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); + const simdscalari vSignMask = _simd_set1_epi32(0x80000000); + const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); + const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); + const simdscalari vExpMin = + _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMinFtz = + _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMax = + _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); - simdscalari 
vSign = _simd_and_si(src, vSignMask); - simdscalari vExp = _simd_and_si(src, vExpMask); - simdscalari vMan = _simd_and_si(src, vManMask); + simdscalari vSign = _simd_and_si(src, vSignMask); + simdscalari vExp = _simd_and_si(src, vExpMask); + simdscalari vMan = _simd_and_si(src, vManMask); simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); - simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); + simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), + _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); // pack output 16-bits into the lower 16-bits of each 32-bit channel - simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); - vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + simdscalari vDst = + _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); + vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); // Flush To Zero - vDst = _simd_andnot_si(vFTZMask, vDst); + vDst = _simd_andnot_si(vFTZMask, vDst); // Apply Infinites / NaN - vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); + vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); // Apply clamps vDst = _simd_andnot_si(vClampMask, vDst); - vDst = _simd_or_si(vDst, - _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); + vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); // Compute Denormals (subnormals) if (!_mm256_testz_si256(vDenormMask, vDenormMask)) { - uint32_t *pDenormMask = (uint32_t*)&vDenormMask; - uint32_t *pExp = (uint32_t*)&vExp; - uint32_t *pMan = (uint32_t*)&vMan; - uint32_t *pDst = (uint32_t*)&vDst; + uint32_t* pDenormMask = (uint32_t*)&vDenormMask; + uint32_t* pExp = (uint32_t*)&vExp; + uint32_t* pMan = (uint32_t*)&vMan; + uint32_t* pDst = (uint32_t*)&vDst; for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) { if (pDenormMask[i]) { // Need to compute subnormal value uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; - uint32_t mantissa = pMan[i] | - (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit + uint32_t mantissa = + pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
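// (For reference, the subnormal path in this loop can be sketched in scalar
// form; helper name hypothetical, not part of this change. A half-float
// subnormal stores value = m * 2^-24, so the explicit 24-bit float
// significand is shifted right by the exponent deficit below 2^-14 plus the
// 23 -> 10 mantissa narrowing:
//
//     static inline uint16_t halfSubnormal(uint32_t floatBits)
//     {
//         uint32_t man = (floatBits & 0x007FFFFF) | 0x00800000; // explicit leading 1
//         int32_t  e   = (int32_t)((floatBits >> 23) & 0xFF) - 127;
//         uint32_t shift = (uint32_t)(-14 - e) + 13;            // 13 == 23 - 10
//         return (uint16_t)(shift < 24 ? man >> shift : 0);     // truncates, no rounding
//     }
//
// which matches the (FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS -
// HALF_MANTISSA_BITS) shift used in the vector code here.)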
+ // Make it explicit - pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); } } } @@ -1014,7 +1172,8 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); // Pack to lower 128-bits - vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); + vDst = _mm256_castsi128_si256( + _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); #if 0 #if !defined(NDEBUG) @@ -1037,7 +1196,7 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> #endif } - static simdscalar unpack(const simdscalar &in) + static simdscalar unpack(const simdscalar& in) { // input is 8 packed float16, output is 8 packed float32 SWR_NOT_IMPL; // @todo @@ -1045,10 +1204,10 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> } #if ENABLE_AVX512_SIMD16 - static simd16scalar pack(const simd16scalar &in) + static simd16scalar pack(const simd16scalar& in) { - simd16scalari result = _simd16_setzero_si(); - simdscalari resultlo = _simd_setzero_si(); + simd16scalari result = _simd16_setzero_si(); + simdscalari resultlo = _simd_setzero_si(); #if (KNOB_ARCH == KNOB_ARCH_AVX) simdscalar simdlo = pack(_simd16_extract_ps(in, 0)); @@ -1070,7 +1229,7 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> return _simd16_castsi_ps(result); } - static simd16scalar unpack(const simd16scalar &in) + static simd16scalar unpack(const simd16scalar& in) { // input is 16 packed float16, output is 16 packed float32 SWR_NOT_IMPL; // @todo @@ -1082,12 +1241,13 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> ////////////////////////////////////////////////////////////////////////// /// TypeTraits - Format type traits specialization for FLOAT32 ////////////////////////////////////////////////////////////////////////// -template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> +template <> +struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> { - static const SWR_TYPE MyType = SWR_TYPE_FLOAT; - static float toFloat() { return 1.0f; } - static float fromFloat() { return 1.0f; } - static inline simdscalar convertSrgb(simdscalar &in) + static const SWR_TYPE MyType = SWR_TYPE_FLOAT; + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static inline simdscalar convertSrgb(simdscalar& in) { #if KNOB_SIMD_WIDTH == 8 __m128 srcLo = _mm256_extractf128_ps(in, 0); @@ -1105,10 +1265,7 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> } #if ENABLE_AVX512_SIMD16 - static inline simd16scalar convertSrgb(simd16scalar &in) - { - return ConvertFloatToSRGB2(in); - } + static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); } #endif }; @@ -1139,7 +1296,7 @@ struct FormatIntType<bits, false, true> ////////////////////////////////////////////////////////////////////////// /// Format1 - Bitfield for single component formats. 
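// (Overview, illustrative only: the Format1..Format4 unions below overlay C++
// bitfields on a packed pixel word whose width FormatIntType derives from the
// summed component bit counts. For example, Format3<5, 6, 5> models an R5G6B5
// texel roughly as in this sketch; the name is hypothetical and not part of
// this change:
//
//     union Format565
//     {
//         struct
//         {
//             uint16_t r : 5; // bits 0..4
//             uint16_t g : 6; // bits 5..10
//             uint16_t b : 5; // bits 11..15
//         };
//         uint16_t a; // overlays the whole word, as Format3's @note explains
//     };
//
// Bitfield layout is implementation-defined in C++; this relies on the
// LSB-first, little-endian packing the supported compilers use.)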
////////////////////////////////////////////////////////////////////////// -template<uint32_t x> +template <uint32_t x> union Format1 { typedef typename FormatIntType<x>::TYPE TYPE; @@ -1153,11 +1310,11 @@ union Format1 { TYPE g : x; }; - struct + struct { TYPE b : x; }; - struct + struct { TYPE a : x; }; @@ -1166,7 +1323,7 @@ union Format1 ////////////////////////////////////////////////////////////////////////// /// Format2 - Bitfield for 2 component formats. ////////////////////////////////////////////////////////////////////////// -template<uint32_t x, uint32_t y> +template <uint32_t x, uint32_t y> union Format2 { typedef typename FormatIntType<x + y>::TYPE TYPE; @@ -1187,7 +1344,7 @@ union Format2 ////////////////////////////////////////////////////////////////////////// /// Format3 - Bitfield for 3 component formats. ////////////////////////////////////////////////////////////////////////// -template<uint32_t x, uint32_t y, uint32_t z> +template <uint32_t x, uint32_t y, uint32_t z> union Format3 { typedef typename FormatIntType<x + y + z>::TYPE TYPE; @@ -1198,13 +1355,13 @@ union Format3 TYPE g : y; TYPE b : z; }; - TYPE a; ///@note This is here to provide full template needed in Formats. + TYPE a; ///@note This is here to provide full template needed in Formats. }; ////////////////////////////////////////////////////////////////////////// /// Format4 - Bitfield for 4 component formats. ////////////////////////////////////////////////////////////////////////// -template<uint32_t x, uint32_t y, uint32_t z, uint32_t w> +template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> struct Format4 { typedef typename FormatIntType<x + y + z + w>::TYPE TYPE; @@ -1218,12 +1375,12 @@ struct Format4 ////////////////////////////////////////////////////////////////////////// /// ComponentTraits - Default components ////////////////////////////////////////////////////////////////////////// -template<uint32_t x, uint32_t y, uint32_t z, uint32_t w> +template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> struct Defaults { INLINE static uint32_t GetDefault(uint32_t comp) { - static const uint32_t defaults[4]{ x, y, z, w }; + static const uint32_t defaults[4]{x, y, z, w}; return defaults[comp]; } }; @@ -1231,25 +1388,31 @@ struct Defaults ////////////////////////////////////////////////////////////////////////// /// ComponentTraits - Component type traits. ////////////////////////////////////////////////////////////////////////// -template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0> +template <SWR_TYPE X, + uint32_t NumBitsX, + SWR_TYPE Y = SWR_TYPE_UNKNOWN, + uint32_t NumBitsY = 0, + SWR_TYPE Z = SWR_TYPE_UNKNOWN, + uint32_t NumBitsZ = 0, + SWR_TYPE W = SWR_TYPE_UNKNOWN, + uint32_t NumBitsW = 0> struct ComponentTraits { INLINE static SWR_TYPE GetType(uint32_t comp) { - static const SWR_TYPE CompType[4]{ X, Y, Z, W }; + static const SWR_TYPE CompType[4]{X, Y, Z, W}; return CompType[comp]; } INLINE static constexpr uint32_t GetConstBPC(uint32_t comp) { - return (comp == 3) ? NumBitsW : - ((comp == 2) ? NumBitsZ : - ((comp == 1) ? NumBitsY : NumBitsX) ); + return (comp == 3) ? NumBitsW + : ((comp == 2) ? NumBitsZ : ((comp == 1) ? 
NumBitsY : NumBitsX)); } INLINE static uint32_t GetBPC(uint32_t comp) { - static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW }; + static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW}; return MyBpc[comp]; } @@ -1285,7 +1448,6 @@ struct ComponentTraits } SWR_INVALID("Invalid component: %d", comp); return TypeTraits<X, NumBitsX>::toFloat(); - } INLINE static float fromFloat(uint32_t comp) @@ -1322,7 +1484,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::loadSOA(pSrc); } - INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src) + INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src) { switch (comp) { @@ -1342,19 +1504,23 @@ struct ComponentTraits SWR_INVALID("Invalid component: %d", comp); } - INLINE static simdscalar unpack(uint32_t comp, simdscalar &in) + INLINE static simdscalar unpack(uint32_t comp, simdscalar& in) { simdscalar out; switch (comp) { case 0: - out = TypeTraits<X, NumBitsX>::unpack(in); break; + out = TypeTraits<X, NumBitsX>::unpack(in); + break; case 1: - out = TypeTraits<Y, NumBitsY>::unpack(in); break; + out = TypeTraits<Y, NumBitsY>::unpack(in); + break; case 2: - out = TypeTraits<Z, NumBitsZ>::unpack(in); break; + out = TypeTraits<Z, NumBitsZ>::unpack(in); + break; case 3: - out = TypeTraits<W, NumBitsW>::unpack(in); break; + out = TypeTraits<W, NumBitsW>::unpack(in); + break; default: SWR_INVALID("Invalid component: %d", comp); out = in; @@ -1363,19 +1529,23 @@ struct ComponentTraits return out; } - INLINE static simdscalar pack(uint32_t comp, simdscalar &in) + INLINE static simdscalar pack(uint32_t comp, simdscalar& in) { simdscalar out; switch (comp) { case 0: - out = TypeTraits<X, NumBitsX>::pack(in); break; + out = TypeTraits<X, NumBitsX>::pack(in); + break; case 1: - out = TypeTraits<Y, NumBitsY>::pack(in); break; + out = TypeTraits<Y, NumBitsY>::pack(in); + break; case 2: - out = TypeTraits<Z, NumBitsZ>::pack(in); break; + out = TypeTraits<Z, NumBitsZ>::pack(in); + break; case 3: - out = TypeTraits<W, NumBitsW>::pack(in); break; + out = TypeTraits<W, NumBitsW>::pack(in); + break; default: SWR_INVALID("Invalid component: %d", comp); out = in; @@ -1384,7 +1554,7 @@ struct ComponentTraits return out; } - INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in) + INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in) { switch (comp) { @@ -1419,7 +1589,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); } - INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src) + INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src) { switch (comp) { @@ -1440,7 +1610,7 @@ struct ComponentTraits TypeTraits<X, NumBitsX>::storeSOA(pDst, src); } - INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in) + INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in) { switch (comp) { @@ -1457,7 +1627,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::unpack(in); } - INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in) + INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in) { switch (comp) { @@ -1474,7 +1644,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::pack(in); } - INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in) + INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in) { switch (comp) { diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h 
b/src/gallium/drivers/swr/rasterizer/core/format_utils.h index 576f14bcafd..b51755dab50 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h @@ -1,37 +1,37 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file utils.h -* -* @brief Utilities used by SWR core related to pixel formats. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file utils.h + * + * @brief Utilities used by SWR core related to pixel formats. 
+ *
+ ******************************************************************************/
#pragma once
#include "core/utils.h"
#include "common/simdintrin.h"
INLINE
-void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
+void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
{
    simd4scalari row0i = SIMD128::castps_si(row0);
    simd4scalari row1i = SIMD128::castps_si(row1);
@@ -39,8 +39,8 @@ void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4sc
    simd4scalari row3i = SIMD128::castps_si(row3);
    simd4scalari vTemp = row2i;
-    row2i = SIMD128::unpacklo_epi32(row2i, row3i);
-    vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
+    row2i = SIMD128::unpacklo_epi32(row2i, row3i);
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
    row3i = row0i;
    row0i = SIMD128::unpacklo_epi32(row0i, row1i);
@@ -61,11 +61,11 @@ void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4sc
}
INLINE
-void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
+void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
{
    simd4scalari vTemp = row2;
-    row2 = SIMD128::unpacklo_epi32(row2, row3);
-    vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
+    row2 = SIMD128::unpacklo_epi32(row2, row3);
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
    row3 = row0;
    row0 = SIMD128::unpacklo_epi32(row0, row1);
@@ -82,17 +82,20 @@ void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd
#if KNOB_SIMD_WIDTH == 8
INLINE
-void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
+void vTranspose3x8(simd4scalar (&vDst)[8],
+                   const simdscalar& vSrc0,
+                   const simdscalar& vSrc1,
+                   const simdscalar& vSrc2)
{
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
@@ -106,17 +109,21 @@ void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdsc
}
INLINE
-void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
+void vTranspose4x8(simd4scalar (&vDst)[8],
+                   const simdscalar& vSrc0,
+                   const simdscalar& vSrc1,
+                   const simdscalar& vSrc2,
+                   const simdscalar& vSrc3)
{
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
@@ -131,9 +138,29 @@ void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdsc
#if ENABLE_AVX512_SIMD16
INLINE
-void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
+void vTranspose4x16(simd16scalar (&dst)[4],
+                    const simd16scalar& src0,
+                    const simd16scalar& src1,
+                    const simd16scalar& src2,
+                    const simd16scalar& src3)
{
-    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
+    const simd16scalari perm =
+        _simd16_set_epi32(15,
+                          11,
+                          7,
+                          3,
+                          14,
+                          10,
+                          6,
+                          2,
+                          13,
+                          9,
+                          5,
+                          1,
+                          12,
+                          8,
+                          4,
+                          0); // pre-permute input to setup the right order after all the unpacking
    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
@@ -153,46 +180,69 @@ void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd1
#endif
INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8],
+                   const simdscalar& vMask0,
+                   const simdscalar& vMask1,
+                   const simdscalar& vMask2,
+                   const simdscalar& vMask3,
+                   const simdscalar& vMask4,
+                   const simdscalar& vMask5,
+                   const simdscalar& vMask6,
+                   const simdscalar& vMask7)
{
-    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
-    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
-    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
-    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
-    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
-    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
-    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
-    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
-    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
-    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
-    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
-    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
-    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
-    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
-    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
-    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
-    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
+    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
+    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
+    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
+    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
+    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
+    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
+    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
+    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
+    simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
+    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
+    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
+    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}
INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8],
+                   const simdscalari& vMask0,
+                   const simdscalari& vMask1,
+                   const simdscalari& vMask2,
+                   const simdscalari& vMask3,
+                   const simdscalari& vMask4,
+                   const simdscalari& vMask5,
+                   const simdscalari& vMask6,
+                   const simdscalari& vMask7)
{
-    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
-        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
+    vTranspose8x8(vDst,
+                  _simd_castsi_ps(vMask0),
+                  _simd_castsi_ps(vMask1),
+                  _simd_castsi_ps(vMask2),
+                  _simd_castsi_ps(vMask3),
+                  _simd_castsi_ps(vMask4),
+                  _simd_castsi_ps(vMask5),
+                  _simd_castsi_ps(vMask6),
+                  _simd_castsi_ps(vMask7));
}
#endif
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
-template<uint32_t bpp>
+template <uint32_t bpp>
struct TransposeSingleComponent
{
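    // (Context, illustrative only: the Transpose* helpers in this file convert
    // SOA channel planes (all R values, then all G, and so on) into AOS
    // pixels. A scalar reference for the 8-wide 8_8_8_8 case; the helper name
    // is hypothetical and not part of this change:
    //
    //     // pSrc: r0..r7 g0..g7 b0..b7 a0..a7 -> pDst: r0 g0 b0 a0 r1 g1 b1 a1 ...
    //     static inline void Transpose8x4Ref(const uint8_t* pSrc, uint8_t* pDst)
    //     {
    //         for (int px = 0; px < 8; ++px)
    //             for (int c = 0; c < 4; ++c)
    //                 pDst[px * 4 + c] = pSrc[c * 8 + px];
    //     }
    //
    // The SIMD implementations below perform the same reshuffle with
    // unpack/shuffle intrinsics instead of scalar loops.)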
////////////////////////////////////////////////////////////////////////// @@ -227,23 +277,38 @@ struct Transpose8_8_8_8 #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH <= KNOB_ARCH_AVX - simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg - simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa - simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb - simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa - simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg - simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa - simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba - simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba + simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg + simd4scalari c2c3 = + SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa + simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb + simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa + simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg + simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa + simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba + simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba SIMD128::store_si((simd4scalari*)pDst, c0123lo); SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi); #else simdscalari dst01 = _simd_shuffle_epi8(src, - _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); + _simd_set_epi32(0x0f078080, + 0x0e068080, + 0x0d058080, + 0x0c048080, + 0x80800b03, + 0x80800a02, + 0x80800901, + 0x80800800)); simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); - dst23 = _simd_shuffle_epi8(dst23, - _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); - simdscalari dst = _simd_or_si(dst01, dst23); + dst23 = _simd_shuffle_epi8(dst23, + _simd_set_epi32(0x80800f07, + 0x80800e06, + 0x80800d05, + 0x80800c04, + 0x0b038080, + 0x0a028080, + 0x09018080, + 0x08008080)); + simdscalari dst = _simd_or_si(dst01, dst23); _simd_store_si((simdscalari*)pDst, dst); #endif #else @@ -254,23 +319,28 @@ struct Transpose8_8_8_8 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg - simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa + simd4scalari src0 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg + simd4scalari src2 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simd4scalari src3 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); simd16scalari cvt3 = 
_simd16_cvtepu8_epi32(src3); - simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba + _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), + dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba } #endif }; @@ -305,9 +375,9 @@ struct Transpose8_8 #if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_load_si((const simdscalari*)pSrc); - simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg - simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg - rg = SIMD128::unpacklo_epi8(rg, g); + simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg + simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg + rg = SIMD128::unpacklo_epi8(rg, g); SIMD128::store_si((simd4scalari*)pDst, rg); #else #error Unsupported vector width @@ -317,8 +387,10 @@ struct Transpose8_8 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg + simd4scalari src0 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = + SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg simdscalari cvt0 = _simd_cvtepu8_epi16(src0); simdscalari cvt1 = _simd_cvtepu8_epi16(src1); @@ -327,7 +399,8 @@ struct Transpose8_8 simdscalari dst = _simd_or_si(cvt0, shl1); - _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast<simdscalari*>(pDst), + dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg } #endif }; @@ -352,13 +425,13 @@ struct Transpose32_32_32_32 simd4scalar vDst[8]; vTranspose4x8(vDst, src0, src1, src2, src3); SIMD128::store_ps((float*)pDst, vDst[0]); - SIMD128::store_ps((float*)pDst+4, vDst[1]); - SIMD128::store_ps((float*)pDst+8, vDst[2]); - SIMD128::store_ps((float*)pDst+12, vDst[3]); - SIMD128::store_ps((float*)pDst+16, vDst[4]); - SIMD128::store_ps((float*)pDst+20, vDst[5]); - SIMD128::store_ps((float*)pDst+24, vDst[6]); - SIMD128::store_ps((float*)pDst+28, vDst[7]); + SIMD128::store_ps((float*)pDst + 4, vDst[1]); + SIMD128::store_ps((float*)pDst + 8, vDst[2]); + SIMD128::store_ps((float*)pDst + 12, vDst[3]); + SIMD128::store_ps((float*)pDst + 16, vDst[4]); + SIMD128::store_ps((float*)pDst + 20, vDst[5]); + SIMD128::store_ps((float*)pDst + 24, vDst[6]); + SIMD128::store_ps((float*)pDst + 28, vDst[7]); #else #error Unsupported vector width #endif @@ -367,19 +440,19 @@ struct Transpose32_32_32_32 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); - simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48); + simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); + 
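Two details in Transpose8_8_8_8 are worth decoding. In the AVX2 path, any shuffle-control byte with its high bit set (the 0x80 fields in the _simd_set_epi32 masks) forces that output byte to zero, which is why the two half-shuffles can simply be ORed together. And the 16-wide path widens each channel byte to a dword, shifts it into position, and ORs the four channels, which per pixel is just the usual byte pack. A scalar sketch of the latter; PackRGBA16 is a hypothetical name for illustration:

#include <cstdint>

// Scalar equivalent of the cvtepu8/slli/or sequence above: each output
// dword packs one pixel's four channel bytes as r | g<<8 | b<<16 | a<<24.
static void PackRGBA16(uint32_t dst[16],
                       const uint8_t r[16], const uint8_t g[16],
                       const uint8_t b[16], const uint8_t a[16])
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = uint32_t(r[i]) | (uint32_t(g[i]) << 8) |
                 (uint32_t(b[i]) << 16) | (uint32_t(a[i]) << 24);
    }
}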
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); + simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48); simd16scalar dst[4]; vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); } #endif }; @@ -418,19 +491,19 @@ struct Transpose32_32_32 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); + simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); + simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); simd16scalar src3 = _simd16_setzero_ps(); simd16scalar dst[4]; vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); } #endif }; @@ -447,11 +520,11 @@ struct Transpose32_32 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 - const float* pfSrc = (const float*)pSrc; - simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0); - simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4); - simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8); - simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12); + const float* pfSrc = (const float*)pSrc; + simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0); + simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4); + simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8); + simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12); simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0); simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0); @@ -471,20 +544,36 @@ struct Transpose32_32 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg - - simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD - simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF - - simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 - simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF 
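Note how Transpose32_32_32 reuses the four-channel vTranspose4x16 by feeding _simd16_setzero_ps() as the missing alpha source, so every fourth output float is simply 0.0f. The scalar picture of that substitution, as a sketch:

static void PadRGBtoRGBA(float dst[4], const float rgb[3])
{
    dst[0] = rgb[0];
    dst[1] = rgb[1];
    dst[2] = rgb[2];
    dst[3] = 0.0f; // the substituted zero register stands in for alpha
}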
- - simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 - simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF - - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg + simd16scalar src0 = + _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = + _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = + _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = + _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps( + tmp0, + tmp1, + 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps( + tmp0, + tmp1, + 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps( + per0, + per0, + 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps( + per1, + per1, + 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg } #endif }; @@ -531,30 +620,38 @@ struct Transpose16_16_16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // 
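The final 0xD8 permutes in Transpose32_32's 16-wide path reorder the 128-bit lanes as {0, 2, 1, 3} (the "(3, 1, 2, 0)" comments read the selector fields high to low), which restores vertex order after the unpacks interleaved the quarters. A sketch assuming an AVX-512 build where the simd16 wrapper lowers to _mm512_shuffle_f32x4; that lowering is an assumption, and the 8-wide emulation differs:

#include <immintrin.h>

// Reorders the four 128-bit lanes of x as {0, 2, 1, 3}.
static inline __m512 FixupLanes(__m512 x)
{
    return _mm512_shuffle_f32x4(x, x, 0xD8);
}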
rgbargbargbargba + simdscalari src0 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si( + tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si( + tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si( + tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si( + tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba } #endif }; @@ -600,30 +697,37 @@ struct Transpose16_16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // 
rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba + simdscalari src0 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si( + tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si( + tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si( + tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si( + tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba } #endif }; @@ -661,17 +765,21 @@ struct Transpose16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg + simdscalari src0 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = + _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 - simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + simdscalari dst0 = _simd_permute2f128_si( + tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si( + tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF - _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - 
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg } #endif }; @@ -879,4 +987,3 @@ struct Transpose64_64_64_64 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; #endif }; - diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 47c0662e5ee..b0d9f05b91b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file frontend.cpp -* -* @brief Implementation for Frontend which handles vertex processing, -* primitive assembly, clipping, binning, etc. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file frontend.cpp + * + * @brief Implementation for Frontend which handles vertex processing, + * primitive assembly, clipping, binning, etc. 
+ * + ******************************************************************************/ #include "api.h" #include "frontend.h" @@ -45,7 +45,8 @@ /// @brief Helper macro to generate a bitmask static INLINE uint32_t GenMask(uint32_t numBits) { - SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); + SWR_ASSERT( + numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); return ((1U << numBits) - 1); } @@ -56,17 +57,13 @@ static INLINE uint32_t GenMask(uint32_t numBits) /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to sync callback. /// @todo This should go away when we switch this to use compute threading. -void ProcessSync( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) { BE_WORK work; - work.type = SYNC; + work.type = SYNC; work.pfnWork = ProcessSyncBE; - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; pTileMgr->enqueue(0, 0, &work); } @@ -76,17 +73,13 @@ void ProcessSync( /// @param pDC - pointer to draw context. /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to sync callback. -void ProcessShutdown( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) { BE_WORK work; - work.type = SHUTDOWN; + work.type = SHUTDOWN; work.pfnWork = ProcessShutdownBE; - MacroTileMgr *pTileMgr = pDC->pTileMgr; + MacroTileMgr* pTileMgr = pDC->pTileMgr; // Enqueue at least 1 work item for each worker thread // account for number of numa nodes uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; @@ -107,14 +100,10 @@ void ProcessShutdown( /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to clear callback. /// @todo This should go away when we switch this to use compute threading. -void ProcessClear( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) { - CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData; - MacroTileMgr *pTileMgr = pDC->pTileMgr; + CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData; + MacroTileMgr* pTileMgr = pDC->pTileMgr; // queue a clear to each macro tile // compute macro tile bounds for the specified rect @@ -124,8 +113,8 @@ void ProcessClear( uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; BE_WORK work; - work.type = CLEAR; - work.pfnWork = ProcessClearBE; + work.type = CLEAR; + work.pfnWork = ProcessClearBE; work.desc.clear = *pDesc; for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) @@ -144,15 +133,11 @@ void ProcessClear( /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to callback. /// @todo This should go away when we switch this to use compute threading. 
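One subtlety in the GenMask helper above: its assert admits numBits == 32, but (1U << 32) is undefined behavior in C++, so that boundary case would misbehave if it ever occurred (in practice callers pass at most the SIMD width). A defensively written variant, as a sketch with a hypothetical name:

#include <cstdint>

static inline uint32_t GenMaskSafe(uint32_t numBits)
{
    // Avoid the undefined 32-bit shift; saturate to all-ones instead.
    return (numBits >= 32) ? 0xFFFFFFFFu : ((1u << numBits) - 1);
}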
-void ProcessStoreTiles( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) { RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId); - MacroTileMgr *pTileMgr = pDC->pTileMgr; - STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; + MacroTileMgr* pTileMgr = pDC->pTileMgr; + STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; // queue a store to each macro tile // compute macro tile bounds for the specified rect @@ -163,8 +148,8 @@ void ProcessStoreTiles( // store tiles BE_WORK work; - work.type = STORETILES; - work.pfnWork = ProcessStoreTilesBE; + work.type = STORETILES; + work.pfnWork = ProcessStoreTilesBE; work.desc.storeTiles = *pDesc; for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) @@ -185,15 +170,14 @@ void ProcessStoreTiles( /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to callback. /// @todo This should go away when we switch this to use compute threading. -void ProcessDiscardInvalidateTiles( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + void* pUserData) { RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId); - DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; - MacroTileMgr *pTileMgr = pDC->pTileMgr; + DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; + MacroTileMgr* pTileMgr = pDC->pTileMgr; // compute macro tile bounds for the specified rect uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM; @@ -218,8 +202,8 @@ void ProcessDiscardInvalidateTiles( // load tiles BE_WORK work; - work.type = DISCARDINVALIDATETILES; - work.pfnWork = ProcessDiscardInvalidateTilesBE; + work.type = DISCARDINVALIDATETILES; + work.pfnWork = ProcessDiscardInvalidateTilesBE; work.desc.discardInvalidateTiles = *pDesc; for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) @@ -238,27 +222,40 @@ void ProcessDiscardInvalidateTiles( /// @param mode - primitive topology for draw operation. /// @param numPrims - number of vertices or indices for draw. /// @todo Frontend needs to be refactored. This will go in appropriate place then. -uint32_t GetNumPrims( - PRIMITIVE_TOPOLOGY mode, - uint32_t numPrims) +uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims) { switch (mode) { - case TOP_POINT_LIST: return numPrims; - case TOP_TRIANGLE_LIST: return numPrims / 3; - case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2; - case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2; - case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1; - case TOP_QUAD_LIST: return numPrims / 4; - case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2; - case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1; - case TOP_LINE_LIST: return numPrims / 2; - case TOP_LINE_LOOP: return numPrims; - case TOP_RECT_LIST: return numPrims / 3; - case TOP_LINE_LIST_ADJ: return numPrims / 4; - case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; - case TOP_TRI_LIST_ADJ: return numPrims / 6; - case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2; + case TOP_POINT_LIST: + return numPrims; + case TOP_TRIANGLE_LIST: + return numPrims / 3; + case TOP_TRIANGLE_STRIP: + return numPrims < 3 ? 
0 : numPrims - 2; + case TOP_TRIANGLE_FAN: + return numPrims < 3 ? 0 : numPrims - 2; + case TOP_TRIANGLE_DISC: + return numPrims < 2 ? 0 : numPrims - 1; + case TOP_QUAD_LIST: + return numPrims / 4; + case TOP_QUAD_STRIP: + return numPrims < 4 ? 0 : (numPrims - 2) / 2; + case TOP_LINE_STRIP: + return numPrims < 2 ? 0 : numPrims - 1; + case TOP_LINE_LIST: + return numPrims / 2; + case TOP_LINE_LOOP: + return numPrims; + case TOP_RECT_LIST: + return numPrims / 3; + case TOP_LINE_LIST_ADJ: + return numPrims / 4; + case TOP_LISTSTRIP_ADJ: + return numPrims < 3 ? 0 : numPrims - 3; + case TOP_TRI_LIST_ADJ: + return numPrims / 6; + case TOP_TRI_STRIP_ADJ: + return numPrims < 4 ? 0 : (numPrims / 2) - 2; case TOP_PATCHLIST_1: case TOP_PATCHLIST_2: @@ -314,27 +311,40 @@ uint32_t GetNumPrims( /// @brief Computes the number of verts given the number of primitives. /// @param mode - primitive topology for draw operation. /// @param numPrims - number of primitives for draw. -uint32_t GetNumVerts( - PRIMITIVE_TOPOLOGY mode, - uint32_t numPrims) +uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims) { switch (mode) { - case TOP_POINT_LIST: return numPrims; - case TOP_TRIANGLE_LIST: return numPrims * 3; - case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; - case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; - case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0; - case TOP_QUAD_LIST: return numPrims * 4; - case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; - case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; - case TOP_LINE_LIST: return numPrims * 2; - case TOP_LINE_LOOP: return numPrims; - case TOP_RECT_LIST: return numPrims * 3; - case TOP_LINE_LIST_ADJ: return numPrims * 4; - case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; - case TOP_TRI_LIST_ADJ: return numPrims * 6; - case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0; + case TOP_POINT_LIST: + return numPrims; + case TOP_TRIANGLE_LIST: + return numPrims * 3; + case TOP_TRIANGLE_STRIP: + return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_FAN: + return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_DISC: + return numPrims ? numPrims + 1 : 0; + case TOP_QUAD_LIST: + return numPrims * 4; + case TOP_QUAD_STRIP: + return numPrims ? numPrims * 2 + 2 : 0; + case TOP_LINE_STRIP: + return numPrims ? numPrims + 1 : 0; + case TOP_LINE_LIST: + return numPrims * 2; + case TOP_LINE_LOOP: + return numPrims; + case TOP_RECT_LIST: + return numPrims * 3; + case TOP_LINE_LIST_ADJ: + return numPrims * 4; + case TOP_LISTSTRIP_ADJ: + return numPrims ? numPrims + 3 : 0; + case TOP_TRI_LIST_ADJ: + return numPrims * 6; + case TOP_TRI_STRIP_ADJ: + return numPrims ? (numPrims + 2) * 2 : 0; case TOP_PATCHLIST_1: case TOP_PATCHLIST_2: @@ -465,10 +475,15 @@ INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVert switch (topology) { case TOP_LISTSTRIP_ADJ: - case TOP_LINE_LIST_ADJ: numVerts = 4; break; + case TOP_LINE_LIST_ADJ: + numVerts = 4; + break; case TOP_TRI_STRIP_ADJ: - case TOP_TRI_LIST_ADJ: numVerts = 6; break; - default: break; + case TOP_TRI_LIST_ADJ: + numVerts = 6; + break; + default: + break; } } @@ -480,14 +495,16 @@ INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVert /// @param numWorkItems - Number of items being worked on by a SIMD. static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) { - uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? 
KNOB_SIMD_WIDTH : numItemsRemaining; + uint32_t numActive = + (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining; uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; return _simd_castps_si(_simd_vmask_ps(mask)); } static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining) { - uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining; + uint32_t numActive = + (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining; uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; return _simd16_castps_si(_simd16_vmask_ps(mask)); } @@ -499,23 +516,20 @@ static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining) /// @param workerId - thread's worker id. Even thread has a unique id. /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) static void StreamOut( - DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - uint32_t* pPrimData, - uint32_t streamIndex) + DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex) { RDTSC_BEGIN(FEStreamout, pDC->drawId); - const API_STATE& state = GetApiState(pDC); - const SWR_STREAMOUT_STATE &soState = state.soState; + const API_STATE& state = GetApiState(pDC); + const SWR_STREAMOUT_STATE& soState = state.soState; uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); - // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex. + // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each + // vertex. uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t); - SWR_STREAMOUT_CONTEXT soContext = { 0 }; + SWR_STREAMOUT_CONTEXT soContext = {0}; // Setup buffer state pointers. for (uint32_t i = 0; i < 4; ++i) @@ -527,14 +541,14 @@ static void StreamOut( for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) { - DWORD slot = 0; + DWORD slot = 0; uint64_t soMask = soState.streamMasks[streamIndex]; // Write all entries into primitive data buffer for SOS. while (_BitScanForward64(&slot, soMask)) { - simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide) - uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex]; + simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide) + uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex]; pa.AssembleSingle(paSlot, primIndex, attrib); // Attribute offset is relative offset from start of vertex. @@ -546,7 +560,8 @@ static void StreamOut( // Store each vertex's attrib at appropriate locations in pPrimData buffer. 
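GenerateMask and GenerateMask16 above turn a remaining-item count into a per-lane execution mask: clamp the count to the SIMD width, set that many low bits, and let _simd_vmask_ps broadcast each bit across its lane. The scalar core of that, as a sketch (TailMask is a hypothetical name):

#include <cstdint>

static uint32_t TailMask(uint32_t itemsRemaining, uint32_t simdWidth)
{
    // Clamp to the vector width, then set one bit per live lane.
    uint32_t numActive = (itemsRemaining < simdWidth) ? itemsRemaining : simdWidth;
    return (numActive > 0) ? ((1u << numActive) - 1) : 0;
}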
for (uint32_t v = 0; v < soVertsPerPrim; ++v) { - uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); + uint32_t* pPrimDataAttrib = + pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); } @@ -554,11 +569,12 @@ static void StreamOut( soMask &= ~(uint64_t(1) << slot); } - // Update pPrimData pointer + // Update pPrimData pointer soContext.pPrimData = pPrimData; // Call SOS - SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function."); + SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, + "Trying to execute uninitialized streamout jit function."); state.pfnSoFunc[streamIndex](soContext); } @@ -620,7 +636,10 @@ INLINE static T RoundDownEven(T value) /// /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS /// -void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount) +void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16, + const simdvertex* vertex, + uint32_t vertexCount, + uint32_t attribCount) { SWR_ASSERT(vertex); SWR_ASSERT(vertex_simd16); @@ -634,11 +653,13 @@ void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const si { for (uint32_t k = 0; k < 4; k += 1) { - temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); + temp.attrib[j][k] = + _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); if ((i + 1) < vertexCount) { - temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); + temp.attrib[j][k] = + _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); } } } @@ -658,9 +679,7 @@ void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const si /// then return the remaining amount of work. /// @param curIndex - The start index for the SIMD. /// @param maxIndex - The last index for all work items. 
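PackPairsOfSimdVertexIntoSimd16Vertex above widens the frontend's simd8 vertices by planting two 8-wide registers into the halves of a 16-wide one via _simd16_insert_ps. A sketch of the same pairing, assuming the wrapper lowers to _mm512_insertf32x8 on an AVX-512 DQ target; that mapping is an assumption about the backing intrinsic:

#include <immintrin.h>

// Builds one 16-wide register from two 8-wide halves.
static inline __m512 PackPair(__m256 lo, __m256 hi)
{
    __m512 r = _mm512_insertf32x8(_mm512_setzero_ps(), lo, 0); // lanes 0..7
    return _mm512_insertf32x8(r, hi, 1);                       // lanes 8..15
}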
-static INLINE uint32_t GetNumInvocations( - uint32_t curIndex, - uint32_t maxIndex) +static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex) { uint32_t remainder = (maxIndex - curIndex); #if USE_SIMD16_FRONTEND @@ -680,17 +699,20 @@ static INLINE uint32_t GetNumInvocations( /// @param pStreamIdBase - pointer to the stream ID buffer /// @param numEmittedVerts - Number of total verts emitted by the GS /// @param pCutBuffer - output buffer to write cuts to -void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer) +void ProcessStreamIdBuffer(uint32_t stream, + uint8_t* pStreamIdBase, + uint32_t numEmittedVerts, + uint8_t* pCutBuffer) { SWR_ASSERT(stream < MAX_SO_STREAMS); - uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8; + uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8; uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U); for (uint32_t b = 0; b < numOutputBytes; ++b) { - uint8_t curInputByte = pStreamIdBase[2*b]; - uint8_t outByte = 0; + uint8_t curInputByte = pStreamIdBase[2 * b]; + uint8_t outByte = 0; for (uint32_t i = 0; i < 4; ++i) { if ((curInputByte & 0x3) != stream) @@ -720,16 +742,17 @@ struct GsBuffers uint8_t* pGsIn; uint8_t* pGsOut[KNOB_SIMD_WIDTH]; uint8_t* pGsTransposed; - void* pStreamCutBuffer; + void* pStreamCutBuffer; }; ////////////////////////////////////////////////////////////////////////// /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler -/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive +/// assembler /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader /// @param numVerts - Number of vertices outputted by the GS /// @param numAttribs - Number of attributes per vertex -template<typename SIMD_T, uint32_t SimdWidth> +template <typename SIMD_T, uint32_t SimdWidth> void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) { uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; @@ -743,7 +766,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t } auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]); - uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; + uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; uint32_t remainingVerts = numVerts; for (uint32_t s = 0; s < numSimd; ++s) @@ -753,21 +776,36 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t // Compute mask to prevent src overflow uint32_t mask = std::min(remainingVerts, SimdWidth); - mask = GenMask(mask); - auto vMask = SIMD_T::vmask_ps(mask); - auto viMask = SIMD_T::castps_si(vMask); + mask = GenMask(mask); + auto vMask = SIMD_T::vmask_ps(mask); + auto viMask = SIMD_T::castps_si(vMask); for (uint32_t a = 0; a < numAttribs; ++a) { - auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); - auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); - auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); - auto attribGatherW = SIMD_T::template 
mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); + auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); + auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float)), + vGatherOffsets, + vMask); + auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float) * 2), + vGatherOffsets, + vMask); + auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>( + SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float) * 3), + vGatherOffsets, + vMask); SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW); + SIMD_T::maskstore_ps( + (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ); + SIMD_T::maskstore_ps( + (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW); pSrcBase += sizeof(float) * 4; pDstBase += sizeof(Float<SIMD_T>) * 4; @@ -783,38 +821,35 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pa - The primitive assembly object. /// @param pGsOut - output stream for GS -template < - typename HasStreamOutT, - typename HasRastT> -static void GeometryShaderStage( - DRAW_CONTEXT *pDC, - uint32_t workerId, - PA_STATE& pa, - GsBuffers* pGsBuffers, - uint32_t* pSoPrimData, +template <typename HasStreamOutT, typename HasRastT> +static void GeometryShaderStage(DRAW_CONTEXT* pDC, + uint32_t workerId, + PA_STATE& pa, + GsBuffers* pGsBuffers, + uint32_t* pSoPrimData, #if USE_SIMD16_FRONTEND - uint32_t numPrims_simd8, + uint32_t numPrims_simd8, #endif - simdscalari const &primID) + simdscalari const& primID) { RDTSC_BEGIN(FEGeometryShader, pDC->drawId); void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - const API_STATE& state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); const SWR_GS_STATE* pState = &state.gsState; - SWR_GS_CONTEXT gsContext; + SWR_GS_CONTEXT gsContext; - static uint8_t sNullBuffer[128] = { 0 }; + static uint8_t sNullBuffer[128] = {0}; for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) { gsContext.pStreams[i] = pGsBuffers->pGsOut[i]; } - gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; + gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; gsContext.PrimitiveID = primID; - uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); + uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; // assemble all attributes for the input primitive @@ -822,7 +857,7 @@ static void GeometryShaderStage( for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) { uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; - uint32_t attribSlot = pState->vertexAttribOffset + slot; + uint32_t attribSlot = pState->vertexAttribOffset + slot; pa.Assemble(srcAttribSlot, attrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) @@ -843,13 +878,13 @@ static void GeometryShaderStage( 
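TransposeSOAtoAOS above walks the GS output one SIMD batch at a time, using masked gathers (per-lane byte offsets plus a live-lane mask) so the final partial batch never reads past the end of the source buffer. A scalar model of the gather primitive it relies on, as a sketch:

#include <cstdint>

// Models mask_i32gather_ps with a zero passthrough: each live lane loads
// a float from base + byteOffsets[lane]; dead lanes produce 0.0f and
// perform no memory access at all.
static void MaskGatherPs(float out[], const uint8_t* base,
                         const int32_t byteOffsets[], const bool laneMask[],
                         int width)
{
    for (int lane = 0; lane < width; ++lane)
    {
        out[lane] = laneMask[lane]
                        ? *reinterpret_cast<const float*>(base + byteOffsets[lane])
                        : 0.0f;
    }
}

Related, ProcessStreamIdBuffer (earlier in this file) compresses the GS's two-bit per-vertex stream IDs into a one-bit-per-vertex cut mask: a vertex is cut exactly when its stream ID differs from the stream being assembled. A scalar restatement, assuming the output buffer arrives zeroed; BuildCutMask is a hypothetical name:

#include <cstdint>

// 2 bits per vertex in -> 1 bit per vertex out; a set bit means "cut".
static void BuildCutMask(uint32_t stream, const uint8_t* streamIdBits,
                         uint32_t numVerts, uint8_t* cutBits)
{
    for (uint32_t v = 0; v < numVerts; ++v)
    {
        uint32_t id  = (streamIdBits[v / 4] >> ((v & 3) * 2)) & 0x3;
        uint8_t  cut = (id != stream) ? 1 : 0;
        cutBits[v / 8] |= uint8_t(cut << (v & 7));
    }
}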
#if USE_SIMD16_FRONTEND uint32_t numInputPrims = numPrims_simd8; #else - uint32_t numInputPrims = pa.NumPrims(); + uint32_t numInputPrims = pa.NumPrims(); #endif for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) { gsContext.InstanceID = instance; - gsContext.mask = GenerateMask(numInputPrims); + gsContext.mask = GenerateMask(numInputPrims); // execute the geometry shader state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext); @@ -868,25 +903,43 @@ static void GeometryShaderStage( { switch (pState->outputTopology) { - case TOP_RECT_LIST: pfnClipFunc = ClipRectangles_simd16; break; - case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break; - case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break; - case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; - default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); + case TOP_RECT_LIST: + pfnClipFunc = ClipRectangles_simd16; + break; + case TOP_TRIANGLE_STRIP: + pfnClipFunc = ClipTriangles_simd16; + break; + case TOP_LINE_STRIP: + pfnClipFunc = ClipLines_simd16; + break; + case TOP_POINT_LIST: + pfnClipFunc = ClipPoints_simd16; + break; + default: + SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); } } #else - PFN_PROCESS_PRIMS pfnClipFunc = nullptr; + PFN_PROCESS_PRIMS pfnClipFunc = nullptr; if (HasRastT::value) { switch (pState->outputTopology) { - case TOP_RECT_LIST: pfnClipFunc = ClipRectangles; break; - case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; - case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; - case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; - default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); + case TOP_RECT_LIST: + pfnClipFunc = ClipRectangles; + break; + case TOP_TRIANGLE_STRIP: + pfnClipFunc = ClipTriangles; + break; + case TOP_LINE_STRIP: + pfnClipFunc = ClipLines; + break; + case TOP_POINT_LIST: + pfnClipFunc = ClipPoints; + break; + default: + SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); } } @@ -922,29 +975,37 @@ static void GeometryShaderStage( } uint8_t* pBase = pInstanceBase + instance * pState->allocationSize; - uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset; + uint8_t* pCutBase = + pState->controlDataSize == 0 ? 
&sNullBuffer[0] : pBase + pState->controlDataOffset; uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset; #if USE_SIMD16_FRONTEND - TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); + TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, + pVertexBaseAOS, + vertexCount, + pState->outputVertexSize); #else - TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); + TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, + pVertexBaseAOS, + vertexCount, + pState->outputVertexSize); #endif uint32_t numAttribs = state.feNumAttributes; for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) { - bool processCutVerts = false; - uint8_t* pCutBuffer = pCutBase; + bool processCutVerts = false; + uint8_t* pCutBuffer = pCutBase; // assign default stream ID, only relevant when GS is outputting a single stream uint32_t streamID = 0; if (pState->isSingleStream) { processCutVerts = true; - streamID = pState->singleStreamID; - if (streamID != stream) continue; + streamID = pState->singleStreamID; + if (streamID != stream) + continue; } else { @@ -955,16 +1016,35 @@ static void GeometryShaderStage( } // multi-stream output, need to translate StreamID buffer to a cut buffer - ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer); - pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; + ProcessStreamIdBuffer( + stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer); + pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; processCutVerts = false; } #if USE_SIMD16_FRONTEND - PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); + PA_STATE_CUT gsPa(pDC, + (uint8_t*)pGsBuffers->pGsTransposed, + numEmittedVerts, + pState->outputVertexSize, + reinterpret_cast<simd16mask*>(pCutBuffer), + numEmittedVerts, + numAttribs, + pState->outputTopology, + processCutVerts, + pa.numVertsPerPrim); #else - PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); + PA_STATE_CUT gsPa(pDC, + (uint8_t*)pGsBuffers->pGsTransposed, + numEmittedVerts, + pState->outputVertexSize, + pCutBuffer, + numEmittedVerts, + numAttribs, + pState->outputTopology, + processCutVerts, + pa.numVertsPerPrim); #endif while (gsPa.GetNextStreamOutput()) @@ -999,18 +1079,19 @@ static void GeometryShaderStage( // Gather data from the SVG if provided. 
simd16scalari vViewportIdx = SIMD16::setzero_si(); - simd16scalari vRtIdx = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4]; + simd16scalari vRtIdx = SIMD16::setzero_si(); + SIMD16::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } - if (state.backendState.readViewportArrayIndex) { - vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); + vViewportIdx = + SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); gsPa.viewportArrayActive = true; } if (state.backendState.readRenderTargetArrayIndex) @@ -1021,36 +1102,50 @@ static void GeometryShaderStage( { // OOB VPAI indices => forced to zero. - vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); - simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); + vViewportIdx = + SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); + simd16scalari vNumViewports = + SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = + SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); gsPa.useAlternateOffset = false; - pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx); + pfnClipFunc(pDC, + gsPa, + workerId, + attrib_simd16, + GenMask(gsPa.NumPrims()), + vPrimId, + vViewportIdx, + vRtIdx); } #else simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); // Gather data from the SVG if provided. simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; + simdscalari vRtIdx = SIMD::setzero_si(); + SIMD::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } - if (state.backendState.readViewportArrayIndex) { - vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); + vViewportIdx = + SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); // OOB VPAI indices => forced to zero. 
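The out-of-bounds handling in this hunk is worth spelling out: max_epi32 raises negative viewport-array indices to zero, cmplt_epi32 builds an all-ones mask only for indices below KNOB_NUM_VIEWPORTS_SCISSORS, and the final and_si zeroes anything at or past the end, so a bad shader output always selects viewport 0 rather than indexing past the viewport/scissor arrays. The same logic per lane, as a scalar sketch:

#include <cstdint>

static uint32_t ClampViewportIndex(int32_t idx, int32_t numViewports)
{
    idx = (idx > 0) ? idx : 0;                        // max_epi32(idx, 0)
    return (idx < numViewports) ? uint32_t(idx) : 0u; // and with cmplt mask
}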
- vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports); + vViewportIdx = + SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); + simdscalari vNumViewports = + SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = + SIMD::cmplt_epi32(vViewportIdx, vNumViewports); vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); gsPa.viewportArrayActive = true; } @@ -1060,7 +1155,14 @@ static void GeometryShaderStage( gsPa.rtArrayActive = true; } - pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx); + pfnClipFunc(pDC, + gsPa, + workerId, + attrib, + GenMask(gsPa.NumPrims()), + vPrimId, + vViewportIdx, + vRtIdx); #endif } } @@ -1073,7 +1175,7 @@ static void GeometryShaderStage( // update GS pipeline stats UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount); UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated); - AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims)); + AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims)); RDTSC_END(FEGeometryShader, 1); } @@ -1083,8 +1185,11 @@ static void GeometryShaderStage( /// @param state - API state /// @param ppGsOut - pointer to GS output buffer allocation /// @param ppCutBuffer - pointer to GS output cut buffer allocation -template<typename SIMD_T, uint32_t SIMD_WIDTH> -static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers) +template <typename SIMD_T, uint32_t SIMD_WIDTH> +static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, + const API_STATE& state, + uint32_t vertsPerPrim, + GsBuffers* pGsBuffers) { auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); @@ -1094,7 +1199,7 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, // Allocate storage for vertex inputs uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim; - pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32); + pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32); // Allocate arena space to hold GS output verts const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize; @@ -1106,7 +1211,8 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, // Allocate storage for transposed GS output uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH; - uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>); + uint32_t transposedBufferSize = + numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>); pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); // Allocate storage to hold temporary stream->cut buffer, if necessary @@ -1116,7 +1222,8 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, } else { - pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); + pGsBuffers->pStreamCutBuffer = + (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); } } @@ -1126,12 +1233,12 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, struct TessellationThreadLocalData { SWR_HS_CONTEXT hsContext; - ScalarPatch patchData[KNOB_SIMD_WIDTH]; - void* 
pTxCtx; - size_t tsCtxSize; + ScalarPatch patchData[KNOB_SIMD_WIDTH]; + void* pTxCtx; + size_t tsCtxSize; simdscalar* pDSOutput; - size_t dsOutputAllocSize; + size_t dsOutputAllocSize; }; THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; @@ -1144,8 +1251,8 @@ static void AllocateTessellationData(SWR_CONTEXT* pContext) /// @TODO - Don't use thread local storage. Use Worker local storage instead. if (gt_pTessellationThreadData == nullptr) { - gt_pTessellationThreadData = (TessellationThreadLocalData*) - AlignedMalloc(sizeof(TessellationThreadLocalData), 64); + gt_pTessellationThreadData = + (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64); memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); } } @@ -1156,42 +1263,37 @@ static void AllocateTessellationData(SWR_CONTEXT* pContext) /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pa - The primitive assembly object. /// @param pGsOut - output stream for GS -template < - typename HasGeometryShaderT, - typename HasStreamOutT, - typename HasRastT> -static void TessellationStages( - DRAW_CONTEXT *pDC, - uint32_t workerId, - PA_STATE& pa, - GsBuffers* pGsBuffers, - uint32_t* pSoPrimData, +template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT> +static void TessellationStages(DRAW_CONTEXT* pDC, + uint32_t workerId, + PA_STATE& pa, + GsBuffers* pGsBuffers, + uint32_t* pSoPrimData, #if USE_SIMD16_FRONTEND - uint32_t numPrims_simd8, + uint32_t numPrims_simd8, #endif - simdscalari const &primID) + simdscalari const& primID) { - const API_STATE& state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); const SWR_TS_STATE& tsState = state.tsState; void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; SWR_ASSERT(gt_pTessellationThreadData); - HANDLE tsCtx = TSInitCtx( - tsState.domain, - tsState.partitioning, - tsState.tsOutputTopology, - gt_pTessellationThreadData->pTxCtx, - gt_pTessellationThreadData->tsCtxSize); + HANDLE tsCtx = TSInitCtx(tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); if (tsCtx == nullptr) { - gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64); - tsCtx = TSInitCtx( - tsState.domain, - tsState.partitioning, - tsState.tsOutputTopology, - gt_pTessellationThreadData->pTxCtx, - gt_pTessellationThreadData->tsCtxSize); + gt_pTessellationThreadData->pTxCtx = + AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64); + tsCtx = TSInitCtx(tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); } SWR_ASSERT(tsCtx); @@ -1201,10 +1303,17 @@ static void TessellationStages( { switch (tsState.postDSTopology) { - case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break; - case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break; - case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; - default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); + case TOP_TRIANGLE_LIST: + pfnClipFunc = ClipTriangles_simd16; + break; + case TOP_LINE_LIST: + pfnClipFunc = ClipLines_simd16; + break; + case TOP_POINT_LIST: + pfnClipFunc = ClipPoints_simd16; + break; + default: + SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); } } @@ -1214,17 +1323,24 @@ static void TessellationStages( { switch 
(tsState.postDSTopology) { - case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; - case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; - case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; - default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); + case TOP_TRIANGLE_LIST: + pfnClipFunc = ClipTriangles; + break; + case TOP_LINE_LIST: + pfnClipFunc = ClipLines; + break; + case TOP_POINT_LIST: + pfnClipFunc = ClipPoints; + break; + default: + SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); } } #endif SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; - hsContext.pCPout = gt_pTessellationThreadData->patchData; - hsContext.PrimitiveID = primID; + hsContext.pCPout = gt_pTessellationThreadData->patchData; + hsContext.PrimitiveID = primID; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); // Max storage for one attribute for an entire simdprimitive @@ -1266,7 +1382,7 @@ static void TessellationStages( for (uint32_t p = 0; p < numPrims; ++p) { // Run Tessellator - SWR_TS_TESSELLATED_DATA tsData = { 0 }; + SWR_TS_TESSELLATED_DATA tsData = {0}; RDTSC_BEGIN(FETessellation, pDC->drawId); TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); AR_EVENT(TessPrimCount(1)); @@ -1279,17 +1395,20 @@ static void TessellationStages( SWR_ASSERT(tsData.NumDomainPoints); // Allocate DS Output memory - uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; + uint32_t requiredDSVectorInvocations = + AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; #if USE_SIMD16_FRONTEND - size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding + size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * + tsState.dsAllocationSize; // simd8 -> simd16, padding #else size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize; - size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; + size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; #endif if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) { AlignedFree(gt_pTessellationThreadData->pDSOutput); - gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64); + gt_pTessellationThreadData->pDSOutput = + (simdscalar*)AlignedMalloc(requiredAllocSize, 64); gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize; } SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); @@ -1301,21 +1420,22 @@ static void TessellationStages( // Run Domain Shader SWR_DS_CONTEXT dsContext; - dsContext.PrimitiveID = pPrimId[p]; - dsContext.pCpIn = &hsContext.pCPout[p]; - dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; - dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; - dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; + dsContext.PrimitiveID = pPrimId[p]; + dsContext.pCpIn = &hsContext.pCPout[p]; + dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; + dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; + dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset; #if USE_SIMD16_FRONTEND - dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 + dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 #else - dsContext.vectorStride = 
requiredDSVectorInvocations; + dsContext.vectorStride = requiredDSVectorInvocations; #endif uint32_t dsInvocations = 0; - for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) + for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; + ++dsContext.vectorOffset) { dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); @@ -1330,14 +1450,14 @@ static void TessellationStages( UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); #if USE_SIMD16_FRONTEND - SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16 + SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16 #endif PA_TESS tessPa( pDC, #if USE_SIMD16_FRONTEND - reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16 - dsContext.vectorStride / 2, // simd8 -> simd16 + reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16 + dsContext.vectorStride / 2, // simd8 -> simd16 #else dsContext.pOutputData, dsContext.vectorStride, @@ -1352,29 +1472,37 @@ static void TessellationStages( while (tessPa.HasWork()) { #if USE_SIMD16_FRONTEND - const uint32_t numPrims = tessPa.NumPrims(); + const uint32_t numPrims = tessPa.NumPrims(); const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); - const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; + const uint32_t numPrims_hi = + std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); - const simdscalari primID_lo = _simd16_extract_si(primID, 0); - const simdscalari primID_hi = _simd16_extract_si(primID, 1); + const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); + const simdscalari primID_lo = _simd16_extract_si(primID, 0); + const simdscalari primID_hi = _simd16_extract_si(primID, 1); #endif if (HasGeometryShaderT::value) { #if USE_SIMD16_FRONTEND tessPa.useAlternateOffset = false; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); if (numPrims_hi) { tessPa.useAlternateOffset = true; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); } #else GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID)); + pDC, + workerId, + tessPa, + pGsBuffers, + pSoPrimData, + _simd_set1_epi32(dsContext.PrimitiveID)); #endif } else @@ -1390,9 +1518,9 @@ static void TessellationStages( if (HasRastT::value) { #if USE_SIMD16_FRONTEND - simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points + simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points #else - simdvector prim[3]; // Only deal with triangles, lines, or points + simdvector prim[3]; // Only deal with triangles, lines, or points #endif RDTSC_BEGIN(FEPAAssemble, pDC->drawId); bool assemble = @@ -1408,15 +1536,15 @@ static void TessellationStages( #if USE_SIMD16_FRONTEND // Gather data from the SVG if provided. 
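// The pDSOutput handling a few lines up grows a thread-local allocation
// monotonically, so steady-state draws reuse the buffer instead of
// reallocating per patch. A hypothetical stand-alone equivalent (sketch;
// uses C++17 std::aligned_alloc where the real code uses AlignedMalloc):
#include <cstdlib>
#include <cstddef>

struct GrowOnlyBuffer
{
    void*  p    = nullptr;
    size_t size = 0;

    void* Ensure(size_t required, size_t align = 64)
    {
        if (required > size)
        {
            std::free(p);
            // aligned_alloc requires size to be a multiple of the alignment
            size = (required + align - 1) & ~(align - 1);
            p    = std::aligned_alloc(align, size);
        }
        return p; // only ever enlarged, never shrunk
    }

    ~GrowOnlyBuffer() { std::free(p); }
};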
simd16scalari vViewportIdx = SIMD16::setzero_si(); - simd16scalari vRtIdx = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4]; + simd16scalari vRtIdx = SIMD16::setzero_si(); + SIMD16::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } - if (state.backendState.readViewportArrayIndex) { vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); @@ -1432,20 +1560,29 @@ static void TessellationStages( { // OOB VPAI indices => forced to zero. vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); - simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vNumViewports = + SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); + vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); tessPa.useAlternateOffset = false; - pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx); + pfnClipFunc(pDC, + tessPa, + workerId, + prim_simd16, + GenMask(numPrims), + primID, + vViewportIdx, + vRtIdx); } #else // Gather data from the SGV if provided. simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; + simdscalari vRtIdx = SIMD::setzero_si(); + SIMD::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } @@ -1456,18 +1593,24 @@ static void TessellationStages( // OOB VPAI indices => forced to zero. 
vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); + simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports); + vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); tessPa.viewportArrayActive = true; } if (state.backendState.readRenderTargetArrayIndex) { - vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); + vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); tessPa.rtArrayActive = true; } - pfnClipFunc(pDC, tessPa, workerId, prim, - GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx); + pfnClipFunc(pDC, + tessPa, + workerId, + prim, + GenMask(tessPa.NumPrims()), + _simd_set1_epi32(dsContext.PrimitiveID), + vViewportIdx, + vRtIdx); #endif } } @@ -1475,7 +1618,7 @@ static void TessellationStages( tessPa.NextPrim(); } // while (tessPa.HasWork()) - } // for (uint32_t p = 0; p < numPrims; ++p) + } // for (uint32_t p = 0; p < numPrims; ++p) #if USE_SIMD16_FRONTEND if (gt_pTessellationThreadData->pDSOutput != nullptr) @@ -1489,8 +1632,8 @@ static void TessellationStages( TSDestroyCtx(tsCtx); } -THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr; -THREAD uint32_t gVertexStoreSize = 0; +THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr; +THREAD uint32_t gVertexStoreSize = 0; ////////////////////////////////////////////////////////////////////////// /// @brief FE handler for SwrDraw. @@ -1503,20 +1646,14 @@ THREAD uint32_t gVertexStoreSize = 0; /// @param pDC - pointer to draw context. /// @param workerId - thread's worker id. 
/// @param pUserData - Pointer to DRAW_WORK -template < - typename IsIndexedT, - typename IsCutIndexEnabledT, - typename HasTessellationT, - typename HasGeometryShaderT, - typename HasStreamOutT, - typename HasRastT> -void ProcessDraw( - SWR_CONTEXT *pContext, - DRAW_CONTEXT *pDC, - uint32_t workerId, - void *pUserData) +template <typename IsIndexedT, + typename IsCutIndexEnabledT, + typename HasTessellationT, + typename HasGeometryShaderT, + typename HasStreamOutT, + typename HasRastT> +void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) { - #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_QUEUE_FE) { @@ -1528,8 +1665,8 @@ void ProcessDraw( void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - DRAW_WORK& work = *(DRAW_WORK*)pUserData; - const API_STATE& state = GetApiState(pDC); + DRAW_WORK& work = *(DRAW_WORK*)pUserData; + const API_STATE& state = GetApiState(pDC); uint32_t indexSize = 0; uint32_t endVertex = work.numVerts; @@ -1567,9 +1704,11 @@ void ProcessDraw( if (HasGeometryShaderT::value) { #if USE_SIMD16_FRONTEND - AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); + AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>( + pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); #else - AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); + AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>( + pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); #endif } @@ -1599,14 +1738,14 @@ void ProcessDraw( #if USE_SIMD16_FRONTEND uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector); #else - uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector); + uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector); #endif SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM); // Compute storage requirements for vertex store // TODO: allocation needs to be rethought for better cut support - uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine + uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes; // grow the vertex store for the PA as necessary @@ -1620,30 +1759,36 @@ void ProcessDraw( SWR_ASSERT(gpVertexStore == nullptr); - gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64)); + gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64)); gVertexStoreSize = vertexStoreSize; SWR_ASSERT(gpVertexStore != nullptr); } // choose primitive assembler - - PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1)); - PA_STATE& pa = paFactory.GetPA(); + + PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, + state.topology, + work.numVerts, + gpVertexStore, + numVerts, + state.frontendState.vsVertexSize, + GetNumVerts(state.topology, 1)); + PA_STATE& pa = paFactory.GetPA(); #if USE_SIMD16_FRONTEND #if USE_SIMD16_SHADERS - simd16vertex vin; + simd16vertex vin; #else - simdvertex vin_lo; - simdvertex vin_hi; + simdvertex vin_lo; + simdvertex vin_hi; #endif - SWR_VS_CONTEXT vsContext_lo; - SWR_VS_CONTEXT vsContext_hi; + SWR_VS_CONTEXT vsContext_lo; + SWR_VS_CONTEXT vsContext_hi; #if USE_SIMD16_SHADERS - vsContext_lo.pVin = 
reinterpret_cast<simdvertex *>(&vin); - vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin); + vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin); + vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin); #else vsContext_lo.pVin = &vin_lo; vsContext_hi.pVin = &vin_hi; @@ -1651,11 +1796,11 @@ void ProcessDraw( vsContext_lo.AlternateOffset = 0; vsContext_hi.AlternateOffset = 1; - SWR_FETCH_CONTEXT fetchInfo_lo = { 0 }; + SWR_FETCH_CONTEXT fetchInfo_lo = {0}; - fetchInfo_lo.pStreams = &state.vertexBuffers[0]; + fetchInfo_lo.pStreams = &state.vertexBuffers[0]; fetchInfo_lo.StartInstance = work.startInstance; - fetchInfo_lo.StartVertex = 0; + fetchInfo_lo.StartVertex = 0; if (IsIndexedT::value) { @@ -1674,27 +1819,30 @@ void ProcessDraw( fetchInfo_lo.StartVertex = work.startVertex; } - SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; + SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; - const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const simd16scalari vScale = + _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) { - uint32_t i = 0; + uint32_t i = 0; simd16scalari vIndex; if (IsIndexedT::value) { fetchInfo_lo.xpIndices = work.xpIB; - fetchInfo_hi.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH + fetchInfo_hi.xpIndices = + fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH } else { vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); fetchInfo_lo.xpIndices = (gfxptr_t)&vIndex; - fetchInfo_hi.xpIndices = (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH + fetchInfo_hi.xpIndices = + (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH } fetchInfo_lo.CurInstance = instanceNum; @@ -1705,24 +1853,24 @@ void ProcessDraw( while (pa.HasWork()) { - // GetNextVsOutput currently has the side effect of updating some PA state machine state. - // So we need to keep this outside of (i < endVertex) check. + // GetNextVsOutput currently has the side effect of updating some PA state machine + // state. So we need to keep this outside of (i < endVertex) check. 
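// Just below, a simd16mask is aliased as simdmask[2] so one 16-wide cut mask
// feeds two 8-wide halves. A hypothetical plain-integer analogue of that
// lo/hi split (lane i of the high half is overall lane 8 + i):
#include <cstdint>

inline void SplitMask16(uint16_t whole, uint8_t& lo, uint8_t& hi)
{
    lo = static_cast<uint8_t>(whole & 0xFFu); // lanes 0..7
    hi = static_cast<uint8_t>(whole >> 8);    // lanes 8..15
}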
- simdmask *pvCutIndices_lo = nullptr; - simdmask *pvCutIndices_hi = nullptr; + simdmask* pvCutIndices_lo = nullptr; + simdmask* pvCutIndices_hi = nullptr; if (IsIndexedT::value) { // simd16mask <=> simdmask[2] - pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0]; - pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1]; + pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0]; + pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1]; } - simd16vertex &vout = pa.GetNextVsOutput(); + simd16vertex& vout = pa.GetNextVsOutput(); - vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout); - vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout); + vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout); + vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout); if (i < endVertex) { @@ -1730,13 +1878,14 @@ void ProcessDraw( { fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices; uint32_t offset; - offset = std::min(endVertex-i, (uint32_t) KNOB_SIMD16_WIDTH); + offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH); offset *= 4; // convert from index to address #if USE_SIMD16_SHADERS fetchInfo_lo.xpLastIndex += offset; #else - fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH); - uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH; + fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH); + uint32_t offset2 = + std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH; assert(offset >= 0); fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices; fetchInfo_hi.xpLastIndex += offset2; @@ -1749,7 +1898,7 @@ void ProcessDraw( #else state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo); - if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH + if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi); } @@ -1759,10 +1908,10 @@ void ProcessDraw( // forward fetch generated vertex IDs to the vertex shader #if USE_SIMD16_SHADERS #if USE_SIMD16_VS - vsContext_lo.VertexID16 = _simd16_insert_si( - vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0); - vsContext_lo.VertexID16 = _simd16_insert_si( - vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1); + vsContext_lo.VertexID16 = + _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0); + vsContext_lo.VertexID16 = + _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1); #else vsContext_lo.VertexID = fetchInfo_lo.VertexID; vsContext_hi.VertexID = fetchInfo_lo.VertexID2; @@ -1776,8 +1925,8 @@ void ProcessDraw( #if USE_SIMD16_VS vsContext_lo.mask16 = GenerateMask16(endVertex - i); #else - vsContext_lo.mask = GenerateMask(endVertex - i); - vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); + vsContext_lo.mask = GenerateMask(endVertex - i); + vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); #endif // forward cut mask to the PA @@ -1806,7 +1955,7 @@ void ProcessDraw( state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo); AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted)); - if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH + if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi); AR_EVENT(VSStats(vsContext_hi.stats.numInstExecuted)); @@ -1840,33 +1989,61 @@ void ProcessDraw( UPDATE_STAT_FE(IaPrimitives, 
pa.NumPrims()); const uint32_t numPrims = pa.NumPrims(); - const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); - const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; + const uint32_t numPrims_lo = + std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); + const uint32_t numPrims_hi = + std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - const simd16scalari primID = pa.GetPrimID(work.startPrimID); - const simdscalari primID_lo = _simd16_extract_si(primID, 0); - const simdscalari primID_hi = _simd16_extract_si(primID, 1); + const simd16scalari primID = pa.GetPrimID(work.startPrimID); + const simdscalari primID_lo = _simd16_extract_si(primID, 0); + const simdscalari primID_hi = _simd16_extract_si(primID, 1); if (HasTessellationT::value) { pa.useAlternateOffset = false; - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( + pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + numPrims_lo, + primID_lo); if (numPrims_hi) { pa.useAlternateOffset = true; - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( + pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + numPrims_hi, + primID_hi); } } else if (HasGeometryShaderT::value) { pa.useAlternateOffset = false; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + numPrims_lo, + primID_lo); if (numPrims_hi) { pa.useAlternateOffset = true; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + numPrims_hi, + primID_hi); } } else @@ -1884,14 +2061,14 @@ void ProcessDraw( // Gather data from the SVG if provided. simd16scalari vpai = SIMD16::setzero_si(); simd16scalari rtai = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4]; + SIMD16::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { pa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } - if (state.backendState.readViewportArrayIndex) { vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); @@ -1899,19 +2076,29 @@ void ProcessDraw( } if (state.backendState.readRenderTargetArrayIndex) { - rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); + rtai = + SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); pa.rtArrayActive = true; } { // OOB VPAI indices => forced to zero. 
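// The numPrims_lo/numPrims_hi computation earlier in this hunk splits a
// simd16 batch into two simd8 halves without branching. Hypothetical scalar
// form (KNOB_SIMD_WIDTH is 8 on the AVX2 paths this mirrors):
#include <algorithm>
#include <cstdint>

inline void SplitPrims(uint32_t numPrims, uint32_t& lo, uint32_t& hi)
{
    constexpr uint32_t kSimdWidth = 8;
    lo = std::min(numPrims, kSimdWidth);              // first 8 (or fewer) prims
    hi = std::max(numPrims, kSimdWidth) - kSimdWidth; // 0 when numPrims <= 8
}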
vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si()); - simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports); + simd16scalari vNumViewports = + SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = + SIMD16::cmplt_epi32(vpai, vNumViewports); vpai = SIMD16::and_si(vClearMask, vpai); pa.useAlternateOffset = false; - pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai); + pDC->pState->pfnProcessPrims_simd16(pDC, + pa, + workerId, + prim_simd16, + GenMask(numPrims), + primID, + vpai, + rtai); } } } @@ -1937,12 +2124,12 @@ void ProcessDraw( } #else - SWR_VS_CONTEXT vsContext; - SWR_FETCH_CONTEXT fetchInfo = { 0 }; + SWR_VS_CONTEXT vsContext; + SWR_FETCH_CONTEXT fetchInfo = {0}; - fetchInfo.pStreams = &state.vertexBuffers[0]; + fetchInfo.pStreams = &state.vertexBuffers[0]; fetchInfo.StartInstance = work.startInstance; - fetchInfo.StartVertex = 0; + fetchInfo.StartVertex = 0; if (IsIndexedT::value) { @@ -1950,7 +2137,8 @@ void ProcessDraw( // if the entire index buffer isn't being consumed, set the last index // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); + fetchInfo.pLastIndex = + (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); if (xpLastRequestedIndex < fetchInfo.pLastIndex) { fetchInfo.pLastIndex = xpLastRequestedIndex; @@ -1961,13 +2149,13 @@ void ProcessDraw( fetchInfo.StartVertex = work.startVertex; } - const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); /// @todo: temporarily move instance loop in the FE to ensure SO ordering for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) { simdscalari vIndex; - uint32_t i = 0; + uint32_t i = 0; if (IsIndexedT::value) { @@ -1975,17 +2163,17 @@ void ProcessDraw( } else { - vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); + vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); fetchInfo.pIndices = (const int32_t*)&vIndex; } fetchInfo.CurInstance = instanceNum; - vsContext.InstanceID = instanceNum; + vsContext.InstanceID = instanceNum; while (pa.HasWork()) { - // GetNextVsOutput currently has the side effect of updating some PA state machine state. - // So we need to keep this outside of (i < endVertex) check. + // GetNextVsOutput currently has the side effect of updating some PA state machine + // state. So we need to keep this outside of (i < endVertex) check. simdmask* pvCutIndices = nullptr; if (IsIndexedT::value) { @@ -1993,12 +2181,11 @@ void ProcessDraw( } simdvertex& vout = pa.GetNextVsOutput(); - vsContext.pVin = &vout; - vsContext.pVout = &vout; + vsContext.pVin = &vout; + vsContext.pVout = &vout; if (i < endVertex) { - // 1. Execute FS/VS for a single SIMD. 
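// GenerateMask(remaining) gates partially filled SIMD batches throughout this
// loop. A plausible scalar reading (hypothetical; the real helper builds a
// vector execution mask rather than an integer):
#include <algorithm>
#include <cstdint>

inline uint32_t GenerateMaskScalar(uint32_t remaining, uint32_t simdWidth = 8)
{
    const uint32_t n = std::min(remaining, simdWidth); // saturate at SIMD width
    return (n >= 32) ? ~0u : ((1u << n) - 1u);         // low n lanes enabled
}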
RDTSC_BEGIN(FEFetchShader, pDC->drawId); state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout); @@ -2055,12 +2242,22 @@ void ProcessDraw( if (HasTessellationT::value) { TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( - pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); + pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + pa.GetPrimID(work.startPrimID)); } else if (HasGeometryShaderT::value) { GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); + pDC, + workerId, + pa, + &gsBuffers, + pSoPrimData, + pa.GetPrimID(work.startPrimID)); } else { @@ -2076,33 +2273,45 @@ void ProcessDraw( // Gather data from the SVG if provided. simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; + simdscalari vRtIdx = SIMD::setzero_si(); + SIMD::Vec4 svgAttrib[4]; - if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) + if (state.backendState.readViewportArrayIndex || + state.backendState.readRenderTargetArrayIndex) { pa.Assemble(VERTEX_SGV_SLOT, svgAttrib); } if (state.backendState.readViewportArrayIndex) { - vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); + vViewportIdx = + SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); // OOB VPAI indices => forced to zero. - vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports); + vViewportIdx = + SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); + simdscalari vNumViewports = + SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = + SIMD::cmplt_epi32(vViewportIdx, vNumViewports); vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); pa.viewportArrayActive = true; } if (state.backendState.readRenderTargetArrayIndex) { - vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); + vRtIdx = + SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); pa.rtArrayActive = true; } - pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, - GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx); + pDC->pState->pfnProcessPrims(pDC, + pa, + workerId, + prim, + GenMask(pa.NumPrims()), + pa.GetPrimID(work.startPrimID), + vViewportIdx, + vRtIdx); } } } @@ -2112,7 +2321,8 @@ void ProcessDraw( if (IsIndexedT::value) { - fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + fetchInfo.pIndices = + (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); } else { @@ -2140,15 +2350,18 @@ struct FEDrawChooser } }; - // Selector for correct templated Draw front-end function -PFN_FE_WORK_FUNC GetProcessDrawFunc( - bool IsIndexed, - bool IsCutIndexEnabled, - bool HasTessellation, - bool HasGeometryShader, - bool HasStreamOut, - bool HasRasterization) +PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed, + bool IsCutIndexEnabled, + bool HasTessellation, + bool HasGeometryShader, + bool HasStreamOut, + bool HasRasterization) { - return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization); + return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, + IsCutIndexEnabled, + HasTessellation, + HasGeometryShader, + HasStreamOut, + HasRasterization); } diff --git 
a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 6a2ec8474f1..38fe77e240d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -1,38 +1,38 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file frontend.h -* -* @brief Definitions for Frontend which handles vertex processing, -* primitive assembly, clipping, binning, etc. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file frontend.h + * + * @brief Definitions for Frontend which handles vertex processing, + * primitive assembly, clipping, binning, etc. 
+ * + ******************************************************************************/ #pragma once #include "context.h" #include "common/simdintrin.h" #include <type_traits> // Calculates the A and B coefficients for the 3 edges of the triangle -// +// // maths for edge equations: // standard form of a line in 2d // Ax + By + C = 0 @@ -40,14 +40,14 @@ // B = x1 - x0 // C = x0y1 - x1y0 INLINE -void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) +void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB) { // vYsub = y1 y2 y0 dc __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); // vY = y0 y1 y2 dc vA = _mm_sub_ps(vY, vYsub); - // Result: + // Result: // A[0] = y0 - y1 // A[1] = y1 - y2 // A[2] = y2 - y0 @@ -57,28 +57,31 @@ void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) // vX = x0 x1 x2 dc vB = _mm_sub_ps(vXsub, vX); - // Result: + // Result: // B[0] = x1 - x0 // B[1] = x2 - x1 // B[2] = x0 - x2 } INLINE -void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) +void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB) { // generate edge equations // A = y0 - y1 // B = x1 - x0 // C = x0y1 - x1y0 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); - vA = _mm_sub_epi32(vY, vYsub); + vA = _mm_sub_epi32(vY, vYsub); __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); - vB = _mm_sub_epi32(vXsub, vX); + vB = _mm_sub_epi32(vXsub, vX); } INLINE -void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) +void triangleSetupABIntVertical(const simdscalari vX[3], + const simdscalari vY[3], + simdscalari (&vA)[3], + simdscalari (&vB)[3]) { // A = y0 - y1 // B = x1 - x0 @@ -93,7 +96,10 @@ void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3] #if ENABLE_AVX512_SIMD16 INLINE -void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3]) +void triangleSetupABIntVertical(const simd16scalari vX[3], + const simd16scalari vY[3], + simd16scalari (&vA)[3], + simd16scalari (&vB)[3]) { // A = y0 - y1 // B = x1 - x0 @@ -112,7 +118,7 @@ void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari v // Px = x0-x2, Py = y0-y2 // Qx = x1-x2, Qy = y1-y2 // |Px Qx| -// det = | | = PxQy - PyQx +// det = | | = PxQy - PyQx // |Py Qy| // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) // try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx @@ -127,37 +133,39 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB) // vBShuf = [B2, B0, B1, B0] __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); // vMul = [A1*B2, B1*A2] - __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); + __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); // shuffle upper to lower // vMul2 = [B1*A2, B1*A2] __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); - //vMul = [A1*B2 - B1*A2] + // vMul = [A1*B2 - B1*A2] vMul = _mm_sub_epi64(vMul, vMul2); int64_t result; _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); double dResult = (double)result; - dResult = dResult * (1.0 / FIXED_POINT16_SCALE); + dResult = dResult * (1.0 / FIXED_POINT16_SCALE); return (float)dResult; } INLINE -void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) +void calcDeterminantIntVertical(const simdscalari vA[3], + const simdscalari vB[3], + simdscalari* pvDet) { // refer to calcDeterminantInt comment for calculation explanation // A1*B2 - simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 - simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 + simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 + simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); - simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 - simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 + simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 + simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 // B1*A2 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); @@ -185,19 +193,22 @@ void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3] #if ENABLE_AVX512_SIMD16 INLINE -void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet) +void calcDeterminantIntVertical(const simd16scalari vA[3], + const simd16scalari vB[3], + simd16scalari* pvDet) { // refer to calcDeterminantInt comment for calculation explanation // A1*B2 - simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b) - simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F + simd16scalari vA1_lo = + _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b) + simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]); simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]); - simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b) - simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F + simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b) + simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F // B1*A2 simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]); @@ -210,32 +221,31 @@ void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari v simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi); // A1*B2 - A2*B1 - simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b) - simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F + simd16scalari 
difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b) + simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE - simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b) - simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F + simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b) + simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F // (3, 1, 2, 0) = 11 01 10 00 = 0xD8 - pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b) - pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F + pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b) + pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F } #endif INLINE -void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) +void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC) { // C = -Ax - By - vC = _mm_mul_ps(vA, vX); - __m128 vCy = _mm_mul_ps(vB, vY); - vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); - vC = _mm_sub_ps(vC, vCy); + vC = _mm_mul_ps(vA, vX); + __m128 vCy = _mm_mul_ps(vB, vY); + vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); + vC = _mm_sub_ps(vC, vCy); } -template<uint32_t NumVerts> -INLINE -void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) +template <uint32_t NumVerts> +INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices) { simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]); simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]); @@ -253,9 +263,8 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) } #if USE_SIMD16_FRONTEND -template<uint32_t NumVerts> -INLINE -void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) +template <uint32_t NumVerts> +INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices) { const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]); const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]); @@ -273,9 +282,10 @@ void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices } #endif -template<uint32_t NumVerts> -INLINE -void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx) +template <uint32_t NumVerts> +INLINE void viewportTransform(simdvector* v, + const SWR_VIEWPORT_MATRICES& vpMatrices, + simdscalari const& vViewportIdx) { // perform a gather of each matrix element based on the viewport array indexes simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); @@ -294,9 +304,10 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, } #if USE_SIMD16_FRONTEND -template<uint32_t NumVerts> -INLINE -void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx) +template <uint32_t NumVerts> +INLINE void viewportTransform(simd16vector* v, + const SWR_VIEWPORT_MATRICES& vpMatrices, + simd16scalari const& vViewportIdx) { // perform a gather of each matrix element based on the viewport array indexes const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); @@ -316,7 +327,7 @@ void viewportTransform(simd16vector *v, 
const SWR_VIEWPORT_MATRICES & vpMatrices #endif INLINE -void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) +void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox) { // Need horizontal fp min here __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); @@ -325,18 +336,17 @@ void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i vMinX = _mm_min_epi32(vX, vX1); - vMinX = _mm_min_epi32(vMinX, vX2); + vMinX = _mm_min_epi32(vMinX, vX2); __m128i vMaxX = _mm_max_epi32(vX, vX1); - vMaxX = _mm_max_epi32(vMaxX, vX2); + vMaxX = _mm_max_epi32(vMaxX, vX2); __m128i vMinY = _mm_min_epi32(vY, vY1); - vMinY = _mm_min_epi32(vMinY, vY2); + vMinY = _mm_min_epi32(vMinY, vY2); __m128i vMaxY = _mm_max_epi32(vY, vY1); - vMaxY = _mm_max_epi32(vMaxY, vY2); + vMaxY = _mm_max_epi32(vMaxY, vY2); bbox.xmin = _mm_extract_epi32(vMinX, 0); bbox.xmax = _mm_extract_epi32(vMaxX, 0); @@ -345,54 +355,84 @@ void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) } INLINE -bool CanUseSimplePoints(DRAW_CONTEXT *pDC) +bool CanUseSimplePoints(DRAW_CONTEXT* pDC) { const API_STATE& state = GetApiState(pDC); return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && - state.rastState.pointSize == 1.0f && - !state.rastState.pointParam && - !state.rastState.pointSpriteEnable && - !state.backendState.clipDistanceMask); + state.rastState.pointSize == 1.0f && !state.rastState.pointParam && + !state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask); } INLINE bool vHasNaN(const __m128& vec) { - const __m128 result = _mm_cmpunord_ps(vec, vec); - const int32_t mask = _mm_movemask_ps(result); + const __m128 result = _mm_cmpunord_ps(vec, vec); + const int32_t mask = _mm_movemask_ps(result); return (mask != 0); } uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); - // ProcessDraw front-end function. 
All combinations of parameter values are available -PFN_FE_WORK_FUNC GetProcessDrawFunc( - bool IsIndexed, - bool IsCutIndexEnabled, - bool HasTessellation, - bool HasGeometryShader, - bool HasStreamOut, - bool HasRasterization); - -void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed, + bool IsCutIndexEnabled, + bool HasTessellation, + bool HasGeometryShader, + bool HasStreamOut, + bool HasRasterization); + +void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); +void ProcessStoreTiles(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + void* pUserData); +void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + void* pUserData); +void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); +void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); #if USE_SIMD16_FRONTEND PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative); #endif -struct PA_STATE_BASE; // forward decl -void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); -void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); +struct PA_STATE_BASE; // forward decl +void BinPoints(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[3], + uint32_t primMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); +void BinLines(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prims[3], + uint32_t primMask, + simdscalari const& primID, + simdscalari const& viewportIdx, + simdscalari const& rtIdx); #if USE_SIMD16_FRONTEND -void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); -void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); +void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[3], + uint32_t primMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); +void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prims[3], + uint32_t primMask, + simd16scalari const& primID, + simd16scalari const& viewportIdx, + simd16scalari const& rtIdx); #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 
d88a3aac97c..b52accbbab3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -1,48 +1,48 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file knobs.h -* -* @brief Static (Compile-Time) Knobs for Core. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file knobs.h + * + * @brief Static (Compile-Time) Knobs for Core. 
+ * + ******************************************************************************/ #pragma once #include <stdint.h> #include <gen_knobs.h> -#define KNOB_ARCH_AVX 0 -#define KNOB_ARCH_AVX2 1 +#define KNOB_ARCH_AVX 0 +#define KNOB_ARCH_AVX2 1 #define KNOB_ARCH_AVX512 2 /////////////////////////////////////////////////////////////////////////////// // AVX512 Support /////////////////////////////////////////////////////////////////////////////// -#define ENABLE_AVX512_SIMD16 1 -#define USE_8x2_TILE_BACKEND 1 -#define USE_SIMD16_FRONTEND 1 -#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND -#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS +#define ENABLE_AVX512_SIMD16 1 +#define USE_8x2_TILE_BACKEND 1 +#define USE_SIMD16_FRONTEND 1 +#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND +#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS /////////////////////////////////////////////////////////////////////////////// // Architecture validation @@ -89,49 +89,49 @@ // Configuration knobs /////////////////////////////////////////////////////////////////////////////// // Maximum supported number of active vertex buffer streams -#define KNOB_NUM_STREAMS 32 +#define KNOB_NUM_STREAMS 32 // Maximum supported active viewports and scissors -#define KNOB_NUM_VIEWPORTS_SCISSORS 16 +#define KNOB_NUM_VIEWPORTS_SCISSORS 16 // Guardband range used by the clipper -#define KNOB_GUARDBAND_WIDTH 32768.0f -#define KNOB_GUARDBAND_HEIGHT 32768.0f +#define KNOB_GUARDBAND_WIDTH 32768.0f +#define KNOB_GUARDBAND_HEIGHT 32768.0f /////////////////////////////// // Macro tile configuration /////////////////////////////// // raster tile dimensions -#define KNOB_TILE_X_DIM 8 -#define KNOB_TILE_X_DIM_SHIFT 3 -#define KNOB_TILE_Y_DIM 8 -#define KNOB_TILE_Y_DIM_SHIFT 3 +#define KNOB_TILE_X_DIM 8 +#define KNOB_TILE_X_DIM_SHIFT 3 +#define KNOB_TILE_Y_DIM 8 +#define KNOB_TILE_Y_DIM_SHIFT 3 -// fixed macrotile pixel dimension for now, eventually will be +// fixed macrotile pixel dimension for now, eventually will be // dynamically set based on tile format and pixel size -#define KNOB_MACROTILE_X_DIM 32 -#define KNOB_MACROTILE_Y_DIM 32 -#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13 -#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13 -#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) -#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) -#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) -#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) +#define KNOB_MACROTILE_X_DIM 32 +#define KNOB_MACROTILE_Y_DIM 32 +#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13 +#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13 +#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) +#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) +#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) +#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) // total # of hot tiles available. 
This should be enough to // fully render a 16kx16k 128bpp render target -#define KNOB_NUM_HOT_TILES_X 256 -#define KNOB_NUM_HOT_TILES_Y 256 -#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT -#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT -#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT +#define KNOB_NUM_HOT_TILES_X 256 +#define KNOB_NUM_HOT_TILES_Y 256 +#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT +#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT +#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT // Max scissor rectangle -#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM -#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM +#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X* KNOB_MACROTILE_X_DIM +#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y* KNOB_MACROTILE_Y_DIM -#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 +#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4 #error "incompatible width/tile dimensions" #endif @@ -160,14 +160,14 @@ /////////////////////////////////////////////////////////////////////////////// // Optimization knobs /////////////////////////////////////////////////////////////////////////////// -#define KNOB_USE_FAST_SRGB TRUE +#define KNOB_USE_FAST_SRGB TRUE // enables cut-aware primitive assembler -#define KNOB_ENABLE_CUT_AWARE_PA TRUE +#define KNOB_ENABLE_CUT_AWARE_PA TRUE // enables early rasterization (useful for small triangles) #if !defined(KNOB_ENABLE_EARLY_RAST) -#define KNOB_ENABLE_EARLY_RAST 1 +#define KNOB_ENABLE_EARLY_RAST 1 #endif #if KNOB_ENABLE_EARLY_RAST @@ -182,6 +182,5 @@ // Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. #if !defined(KNOB_ENABLE_TOSS_POINTS) -#define KNOB_ENABLE_TOSS_POINTS 0 +#define KNOB_ENABLE_TOSS_POINTS 0 #endif - diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h index 12c2a3031ea..f8797a8f2bc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file knobs_init.h -* -* @brief Dynamic Knobs Initialization for Core. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
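An aside on the macrotile knobs in knobs.h above. Macrotile extents are carried in 24.8 fixed point: KNOB_MACROTILE_X_DIM_FIXED is KNOB_MACROTILE_X_DIM << 8, and KNOB_MACROTILE_X_DIM_FIXED_SHIFT (13) is the log2 of that value, so a fixed-point coordinate maps to its macrotile with a single shift. A minimal standalone sketch of the arithmetic (not part of this commit; constants copied from the header, variable names illustrative):

// Sketch: how the tile/macrotile knobs above relate.
#include <cassert>
#include <cstdint>

constexpr uint32_t TILE_X_DIM_SHIFT            = 3;                    // 8-pixel raster tiles
constexpr uint32_t MACROTILE_X_DIM             = 32;
constexpr uint32_t MACROTILE_X_DIM_FIXED       = MACROTILE_X_DIM << 8; // 24.8 fixed point
constexpr uint32_t MACROTILE_X_DIM_FIXED_SHIFT = 13;                   // log2(32 << 8)

int main()
{
    static_assert(MACROTILE_X_DIM_FIXED == 1u << MACROTILE_X_DIM_FIXED_SHIFT,
                  "fixed shift matches fixed dim");

    uint32_t pixelX = 100;
    uint32_t fixedX = pixelX << 8;                           // pixel coord in 24.8 fixed point
    uint32_t macroX = fixedX >> MACROTILE_X_DIM_FIXED_SHIFT; // macrotile column: 100 / 32 = 3
    uint32_t tileX  = pixelX >> TILE_X_DIM_SHIFT;            // raster tile column: 100 / 8 = 12
    assert(macroX == 3 && tileX == 12);
    return 0;
}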
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file knobs_init.h + * + * @brief Dynamic Knobs Initialization for Core. + * + ******************************************************************************/ #pragma once #include <core/knobs.h> @@ -37,9 +37,9 @@ template <typename T> static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue) { - uint32_t value = 0; - char* pStopped = nullptr; - value = strtoul(pOverride, &pStopped, 0); + uint32_t value = 0; + char* pStopped = nullptr; + value = strtoul(pOverride, &pStopped, 0); if (pStopped != pOverride) { knobValue = static_cast<T>(value); @@ -65,9 +65,9 @@ static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue) } // Try converting to a number and casting to bool - uint32_t value = 0; - char* pStopped = nullptr; - value = strtoul(pOverride, &pStopped, 0); + uint32_t value = 0; + char* pStopped = nullptr; + value = strtoul(pOverride, &pStopped, 0); if (pStopped != pOverride) { knobValue = value != 0; diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h index 2ca8c1b3e8d..3b23974a7f4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.h +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -1,28 +1,28 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
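The ConvertEnvToKnob overrides above lean on strtoul's endptr contract: the knob is written only when the conversion consumed at least one character (pStopped advanced past pOverride), and base 0 lets users supply decimal, hex, or octal. A standalone sketch of that pattern (not part of this commit; the Sketch suffix marks it as illustrative rather than the driver's function):

// Sketch: strtoul-based env override, mirroring the ConvertEnvToKnob logic above.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

template <typename T>
static void ConvertEnvToKnobSketch(const char* pOverride, T& knobValue)
{
    char*    pStopped = nullptr;
    uint32_t value    = strtoul(pOverride, &pStopped, 0); // base 0: "16", "0x10", "020" all parse
    if (pStopped != pOverride) // at least one character consumed, so accept the override
    {
        knobValue = static_cast<T>(value);
    }
}

int main()
{
    uint32_t knob = 4;
    ConvertEnvToKnobSketch("0x10", knob); // hex accepted: knob becomes 16
    ConvertEnvToKnobSketch("junk", knob); // nothing consumed: knob stays 16
    printf("%u\n", knob);                 // prints 16
    return 0;
}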
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file multisample.h -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file multisample.h + * + ******************************************************************************/ #pragma once @@ -36,225 +36,387 @@ typedef std::integral_constant<int, 1> SingleSampleT; INLINE SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) { - switch(numSamples) + switch (numSamples) { - case 1: return SWR_MULTISAMPLE_1X; - case 2: return SWR_MULTISAMPLE_2X; - case 4: return SWR_MULTISAMPLE_4X; - case 8: return SWR_MULTISAMPLE_8X; - case 16: return SWR_MULTISAMPLE_16X; - default: assert(0); return SWR_MULTISAMPLE_1X; + case 1: + return SWR_MULTISAMPLE_1X; + case 2: + return SWR_MULTISAMPLE_2X; + case 4: + return SWR_MULTISAMPLE_4X; + case 8: + return SWR_MULTISAMPLE_8X; + case 16: + return SWR_MULTISAMPLE_16X; + default: + assert(0); + return SWR_MULTISAMPLE_1X; } } // hardcoded offsets based on Direct3d standard multisample positions // 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner // coords are 0.8 fixed point offsets from (0, 0) -template<SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false> +template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false> struct MultisampleTraits { - INLINE static float X(uint32_t sampleNum) = delete; - INLINE static float Y(uint32_t sampleNum) = delete; - INLINE static simdscalari FullSampleMask() = delete; + INLINE static float X(uint32_t sampleNum) = delete; + INLINE static float Y(uint32_t sampleNum) = delete; + INLINE static simdscalari FullSampleMask() = delete; static const uint32_t numSamples = 0; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_1X, false> { - INLINE static float X(uint32_t sampleNum) {return samplePosX[sampleNum];}; - INLINE static float Y(uint32_t sampleNum) {return samplePosY[sampleNum];}; - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; + INLINE static float X(uint32_t sampleNum) { return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { 
return samplePosY[sampleNum]; }; + INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }; - static const uint32_t numSamples = 1; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; - static constexpr uint32_t samplePosXi[1] = { 0x80 }; - static constexpr uint32_t samplePosYi[1] = { 0x80 }; - static constexpr float samplePosX[1] = { 0.5f }; - static constexpr float samplePosY[1] = { 0.5f }; + static const uint32_t numSamples = 1; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; + static constexpr uint32_t samplePosXi[1] = {0x80}; + static constexpr uint32_t samplePosYi[1] = {0x80}; + static constexpr float samplePosX[1] = {0.5f}; + static constexpr float samplePosY[1] = {0.5f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_1X, true> { - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; - INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; - - static const uint32_t numSamples = 1; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; - static constexpr uint32_t samplePosXi[1] = { 0x80 }; - static constexpr uint32_t samplePosYi[1] = { 0x80 }; - static constexpr float samplePosX[1] = { 0.5f }; - static constexpr float samplePosY[1] = { 0.5f }; + INLINE static float X(uint32_t sampleNum) { return 0.5f; }; + INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; + INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }; + + static const uint32_t numSamples = 1; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; + static constexpr uint32_t samplePosXi[1] = {0x80}; + static constexpr uint32_t samplePosYi[1] = {0x80}; + static constexpr float samplePosX[1] = {0.5f}; + static constexpr float samplePosY[1] = {0.5f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_2X, false> { - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; - INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + INLINE static float X(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosX[sampleNum]; + }; + INLINE static float Y(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosY[sampleNum]; + }; INLINE static simdscalari FullSampleMask() { - static const simdscalari mask =_simd_set1_epi32(0x3); - return mask; + static const simdscalari mask = _simd_set1_epi32(0x3); + return mask; } - static const uint32_t numSamples = 2; - static const uint32_t numCoverageSamples = 2; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; - static constexpr uint32_t samplePosXi[2] = { 0xC0, 0x40 }; - static constexpr uint32_t samplePosYi[2] = { 0xC0, 0x40 }; - static constexpr float samplePosX[2] = {0.75f, 0.25f}; - static constexpr float samplePosY[2] = {0.75f, 0.25f}; + static const uint32_t numSamples = 2; + static const uint32_t numCoverageSamples = 2; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; + static constexpr uint32_t samplePosXi[2] = {0xC0, 0x40}; + static constexpr uint32_t samplePosYi[2] = {0xC0, 0x40}; + static constexpr float samplePosX[2] = {0.75f, 0.25f}; + static 
constexpr float samplePosY[2] = {0.75f, 0.25f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_2X, true> { - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; - INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; + INLINE static float X(uint32_t sampleNum) { return 0.5f; }; + INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; INLINE static simdscalari FullSampleMask() { - static const simdscalari mask =_simd_set1_epi32(0x3); - return mask; + static const simdscalari mask = _simd_set1_epi32(0x3); + return mask; } - static const uint32_t numSamples = 2; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; - static constexpr uint32_t samplePosXi[2] = { 0x80 , 0x80 }; - static constexpr uint32_t samplePosYi[2] = { 0x80 , 0x80 }; - static constexpr float samplePosX[2] = { 0.5f, 0.5f }; - static constexpr float samplePosY[2] = { 0.5f, 0.5f }; + static const uint32_t numSamples = 2; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; + static constexpr uint32_t samplePosXi[2] = {0x80, 0x80}; + static constexpr uint32_t samplePosYi[2] = {0x80, 0x80}; + static constexpr float samplePosX[2] = {0.5f, 0.5f}; + static constexpr float samplePosY[2] = {0.5f, 0.5f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_4X, false> { - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; - INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + INLINE static float X(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosX[sampleNum]; + }; + INLINE static float Y(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosY[sampleNum]; + }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); return mask; } - static const uint32_t numSamples = 4; - static const uint32_t numCoverageSamples = 4; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; - static constexpr uint32_t samplePosXi[4] = { 0x60, 0xE0, 0x20, 0xA0 }; - static constexpr uint32_t samplePosYi[4] = { 0x20, 0x60, 0xA0, 0xE0 }; - static constexpr float samplePosX[4] = { 0.375f, 0.875f, 0.125f, 0.625f }; - static constexpr float samplePosY[4] = { 0.125f, 0.375f, 0.625f, 0.875f }; + static const uint32_t numSamples = 4; + static const uint32_t numCoverageSamples = 4; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; + static constexpr uint32_t samplePosXi[4] = {0x60, 0xE0, 0x20, 0xA0}; + static constexpr uint32_t samplePosYi[4] = {0x20, 0x60, 0xA0, 0xE0}; + static constexpr float samplePosX[4] = {0.375f, 0.875f, 0.125f, 0.625f}; + static constexpr float samplePosY[4] = {0.125f, 0.375f, 0.625f, 0.875f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_4X, true> { - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; - INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; + INLINE static float X(uint32_t sampleNum) { return 0.5f; }; + INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); return mask; } - static const uint32_t numSamples = 4; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; - static constexpr 
uint32_t samplePosXi[4] = { 0x80, 0x80, 0x80, 0x80 }; - static constexpr uint32_t samplePosYi[4] = { 0x80, 0x80, 0x80, 0x80 }; - static constexpr float samplePosX[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - static constexpr float samplePosY[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const uint32_t numSamples = 4; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; + static constexpr uint32_t samplePosXi[4] = {0x80, 0x80, 0x80, 0x80}; + static constexpr uint32_t samplePosYi[4] = {0x80, 0x80, 0x80, 0x80}; + static constexpr float samplePosX[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + static constexpr float samplePosY[4] = {0.5f, 0.5f, 0.5f, 0.5f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_8X, false> { - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; - INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + INLINE static float X(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosX[sampleNum]; + }; + INLINE static float Y(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosY[sampleNum]; + }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); return mask; } - static const uint32_t numSamples = 8; - static const uint32_t numCoverageSamples = 8; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; - static constexpr uint32_t samplePosXi[8] = { 0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0 }; - static constexpr uint32_t samplePosYi[8] = { 0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10 }; - static constexpr float samplePosX[8] = { 0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f }; - static constexpr float samplePosY[8] = { 0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f }; + static const uint32_t numSamples = 8; + static const uint32_t numCoverageSamples = 8; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; + static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0}; + static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10}; + static constexpr float samplePosX[8] = { + 0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f}; + static constexpr float samplePosY[8] = { + 0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_8X, true> { - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; - INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; + INLINE static float X(uint32_t sampleNum) { return 0.5f; }; + INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); return mask; } - static const uint32_t numSamples = 8; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; - static constexpr uint32_t samplePosXi[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - static constexpr uint32_t samplePosYi[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - static constexpr float samplePosX[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - static constexpr float samplePosY[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const uint32_t 
numSamples = 8; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; + static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + static constexpr float samplePosX[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + static constexpr float samplePosY[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_16X, false> { - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; - INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + INLINE static float X(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosX[sampleNum]; + }; + INLINE static float Y(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + return samplePosY[sampleNum]; + }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); return mask; } - static const uint32_t numSamples = 16; - static const uint32_t numCoverageSamples = 16; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; - static constexpr uint32_t samplePosXi[16] = { 0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10 }; - static constexpr uint32_t samplePosYi[16] = { 0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00 }; - static constexpr float samplePosX[16] = { 0.5625f, 0.4375f, 0.3125f, 0.7500f, 0.1875f, 0.6250f, 0.8125f, 0.6875f, 0.3750f, 0.5000f, 0.2500f, 0.1250f, 0.0000f, 0.9375f, 0.8750f, 0.0625f }; - static constexpr float samplePosY[16] = { 0.5625f, 0.3125f, 0.6250f, 0.4375f, 0.3750f, 0.8125f, 0.6875f, 0.1875f, 0.8750f, 0.0625f, 0.1250f, 0.7500f, 0.5000f, 0.2500f, 0.9375f, 0.0000f }; + static const uint32_t numSamples = 16; + static const uint32_t numCoverageSamples = 16; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; + static constexpr uint32_t samplePosXi[16] = {0x90, + 0x70, + 0x50, + 0xC0, + 0x30, + 0xA0, + 0xD0, + 0xB0, + 0x60, + 0x80, + 0x40, + 0x20, + 0x00, + 0xF0, + 0xE0, + 0x10}; + static constexpr uint32_t samplePosYi[16] = {0x90, + 0x50, + 0xA0, + 0x70, + 0x60, + 0xD0, + 0xB0, + 0x30, + 0xE0, + 0x10, + 0x20, + 0xC0, + 0x80, + 0x40, + 0xF0, + 0x00}; + static constexpr float samplePosX[16] = {0.5625f, + 0.4375f, + 0.3125f, + 0.7500f, + 0.1875f, + 0.6250f, + 0.8125f, + 0.6875f, + 0.3750f, + 0.5000f, + 0.2500f, + 0.1250f, + 0.0000f, + 0.9375f, + 0.8750f, + 0.0625f}; + static constexpr float samplePosY[16] = {0.5625f, + 0.3125f, + 0.6250f, + 0.4375f, + 0.3750f, + 0.8125f, + 0.6875f, + 0.1875f, + 0.8750f, + 0.0625f, + 0.1250f, + 0.7500f, + 0.5000f, + 0.2500f, + 0.9375f, + 0.0000f}; }; -template<> +template <> struct MultisampleTraits<SWR_MULTISAMPLE_16X, true> { - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; - INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; + INLINE static float X(uint32_t sampleNum) { return 0.5f; }; + INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); return mask; } - static const uint32_t numSamples = 16; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = 
SWR_MULTISAMPLE_16X; - static constexpr uint32_t samplePosXi[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - static constexpr uint32_t samplePosYi[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - static constexpr float samplePosX[16] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - static constexpr float samplePosY[16] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const uint32_t numSamples = 16; + static const uint32_t numCoverageSamples = 1; + static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; + static constexpr uint32_t samplePosXi[16] = {0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80}; + static constexpr uint32_t samplePosYi[16] = {0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80}; + static constexpr float samplePosX[16] = {0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f}; + static constexpr float samplePosY[16] = {0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f, + 0.5f}; }; INLINE -bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, const SWR_MULTISAMPLE_POS& samplePos) +bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, + const SWR_MULTISAMPLE_POS& samplePos) { // detect if we're using standard or center sample patterns const uint32_t *standardPosX, *standardPosY; - switch(sampleCount) + switch (sampleCount) { case SWR_MULTISAMPLE_1X: standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi; @@ -281,15 +443,15 @@ bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, const SWR_MUL } // scan sample pattern for standard or center - uint32_t numSamples = GetNumSamples(sampleCount); - bool bIsStandard = true; - if(numSamples > 1) + uint32_t numSamples = GetNumSamples(sampleCount); + bool bIsStandard = true; + if (numSamples > 1) { - for(uint32_t i = 0; i < numSamples; i++) + for (uint32_t i = 0; i < numSamples; i++) { - bIsStandard = (standardPosX[i] == samplePos.Xi(i)) || - (standardPosY[i] == samplePos.Yi(i)); - if(!bIsStandard) + bIsStandard = + (standardPosX[i] == samplePos.Xi(i)) || (standardPosY[i] == samplePos.Yi(i)); + if (!bIsStandard) break; } } diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index ab1d46de9d0..e19c8ea4a79 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -1,33 +1,33 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
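The MultisampleTraits specializations above pair each float sample position with an integer one, and the relation is plain 0.8 fixed point: samplePosXi[i] == samplePosX[i] * 256, so the pixel center 0x80 is 0.5f and the first 8X standard position 0x90 is 0.5625f. A small standalone check of that assumption (not part of this commit):

// Sketch: the samplePosXi tables are 0.8 fixed point versions of samplePosX.
#include <cassert>
#include <cstdint>

static float FixedToFloat(uint32_t xi) { return xi / 256.0f; } // 0.8 fixed point to float

int main()
{
    assert(FixedToFloat(0x80) == 0.5f);    // pixel center: 1X and all center patterns
    assert(FixedToFloat(0x90) == 0.5625f); // 8X standard pattern, sample 0, X
    assert(FixedToFloat(0x70) == 0.4375f); // 8X standard pattern, sample 1, X
    return 0;
}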
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file pa.h -* -* @brief Definitions for primitive assembly. -* N primitives are assembled at a time, where N is the SIMD width. -* A state machine, that is specific for a given topology, drives the -* assembly of vertices into triangles. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file pa.h + * + * @brief Definitions for primitive assembly. + * N primitives are assembled at a time, where N is the SIMD width. + * A state machine, that is specific for a given topology, drives the + * assembly of vertices into triangles. 
+ * + ******************************************************************************/ #pragma once #include "frontend.h" @@ -42,13 +42,13 @@ struct PA_STATE SIMD_WIDTH_LOG2 = 4 }; - typedef simd16mask SIMDMASK; + typedef simd16mask SIMDMASK; - typedef simd16scalar SIMDSCALAR; - typedef simd16vector SIMDVECTOR; - typedef simd16vertex SIMDVERTEX; + typedef simd16scalar SIMDSCALAR; + typedef simd16vector SIMDVECTOR; + typedef simd16vertex SIMDVERTEX; - typedef simd16scalari SIMDSCALARI; + typedef simd16scalari SIMDSCALARI; #else enum @@ -58,36 +58,45 @@ struct PA_STATE SIMD_WIDTH_LOG2 = 3 }; - typedef simdmask SIMDMASK; + typedef simdmask SIMDMASK; - typedef simdscalar SIMDSCALAR; - typedef simdvector SIMDVECTOR; - typedef simdvertex SIMDVERTEX; + typedef simdscalar SIMDSCALAR; + typedef simdvector SIMDVECTOR; + typedef simdvertex SIMDVERTEX; - typedef simdscalari SIMDSCALARI; + typedef simdscalari SIMDSCALARI; #endif - DRAW_CONTEXT *pDC{ nullptr }; // draw context - uint8_t* pStreamBase{ nullptr }; // vertex stream - uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts - uint32_t vertexStride{ 0 }; // stride of a vertex in simdvector units + DRAW_CONTEXT* pDC{nullptr}; // draw context + uint8_t* pStreamBase{nullptr}; // vertex stream + uint32_t streamSizeInVerts{0}; // total size of the input stream in verts + uint32_t vertexStride{0}; // stride of a vertex in simdvector units - // The topology the binner will use. In some cases the FE changes the topology from the api state. - PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; + // The topology the binner will use. In some cases the FE changes the topology from the api + // state. + PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN}; #if ENABLE_AVX512_SIMD16 - bool useAlternateOffset{ false }; + bool useAlternateOffset{false}; #endif - bool viewportArrayActive{ false }; - bool rtArrayActive { false }; - uint32_t numVertsPerPrim{ 0 }; + bool viewportArrayActive{false}; + bool rtArrayActive{false}; + uint32_t numVertsPerPrim{0}; - PA_STATE(){} - PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) : - pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {} + PA_STATE() {} + PA_STATE(DRAW_CONTEXT* in_pDC, + uint8_t* in_pStreamBase, + uint32_t in_streamSizeInVerts, + uint32_t in_vertexStride, + uint32_t in_numVertsPerPrim) : + pDC(in_pDC), + pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), + vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) + { + } - virtual bool HasWork() = 0; + virtual bool HasWork() = 0; virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; #if ENABLE_AVX512_SIMD16 virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0; @@ -96,14 +105,14 @@ struct PA_STATE #if ENABLE_AVX512_SIMD16 virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0; #endif - virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0; - virtual bool NextPrim() = 0; - virtual SIMDVERTEX& GetNextVsOutput() = 0; - virtual bool GetNextStreamOutput() = 0; - virtual SIMDMASK& GetNextVsIndices() = 0; - virtual uint32_t NumPrims() = 0; - virtual void Reset() = 0; - virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0; + virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0; + virtual bool NextPrim() = 0; + virtual SIMDVERTEX& 
GetNextVsOutput() = 0; + virtual bool GetNextStreamOutput() = 0; + virtual SIMDMASK& GetNextVsIndices() = 0; + virtual uint32_t NumPrims() = 0; + virtual void Reset() = 0; + virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0; }; // The Optimized PA is a state machine that assembles triangles from vertex shader simd @@ -117,69 +126,77 @@ struct PA_STATE // 1. We call this the current and previous simd vertex. // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In // order to assemble the second triangle, for a triangle list, we'll need the -// last vertex from the previous simd and the first 2 vertices from the current simd. +// last vertex from the previous simd and the first 2 vertices from the current +// simd. // 3. At times the PA can assemble multiple triangles from the 2 simd vertices. // // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without // cuts struct PA_STATE_OPT : public PA_STATE { - uint32_t numPrims{ 0 }; // Total number of primitives for draw. - uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. + uint32_t numPrims{0}; // Total number of primitives for draw. + uint32_t numPrimsComplete{0}; // Total number of complete primitives. - uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. + uint32_t numSimdPrims{0}; // Number of prims in current simd. - uint32_t cur{ 0 }; // index to current VS output. - uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. - const uint32_t first{ 0 }; // index to first VS output. Used for tri fan and line loop. + uint32_t cur{0}; // index to current VS output. + uint32_t prev{0}; // index to prev VS output. Not really needed in the state. + const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop. - uint32_t counter{ 0 }; // state counter - bool reset{ false }; // reset state + uint32_t counter{0}; // state counter + bool reset{false}; // reset state - uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) + uint32_t primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2}) SIMDSCALARI primID; - typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); + typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); #if ENABLE_AVX512_SIMD16 - typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); + typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); #endif - typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); + typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, + uint32_t slot, + uint32_t primIndex, + simd4scalar verts[]); - PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. + PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles. #if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr }; + PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr}; #endif - PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. - PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset + PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ + nullptr}; // PA state machine function for assembling single triangle. 
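One way to see the arithmetic behind the comment above (two simd vertex blocks in flight, with 4-wide SSE not a multiple of the 3 verts per triangle): vertex v of triangle t is packed vertex 3t + v, which lives in simd block (3t + v) / width, lane (3t + v) % width. For width 4, triangle 1 therefore needs the last lane of the previous block plus the first two lanes of the current one, exactly as the comment says. A standalone sketch (not part of this commit):

// Sketch: block/lane placement of triangle-list vertices for the SIMD4 case.
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t simdWidth = 4; // SSE case from the comment above
    for (uint32_t tri = 0; tri < 4; ++tri)
    {
        printf("tri %u:", tri);
        for (uint32_t v = 0; v < 3; ++v)
        {
            uint32_t vertex = tri * 3 + v; // packed vertex index
            printf(" (block %u, lane %u)", vertex / simdWidth, vertex % simdWidth);
        }
        printf("\n"); // tri 1 prints: (block 0, lane 3) (block 1, lane 0) (block 1, lane 1)
    }
    return 0;
}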
+ PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset #if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr }; + PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr}; #endif // state used to advance the PA when Next is called - PFN_PA_FUNC pfnPaNextFunc{ nullptr }; + PFN_PA_FUNC pfnPaNextFunc{nullptr}; #if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr }; + PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr}; #endif - uint32_t nextNumSimdPrims{ 0 }; - uint32_t nextNumPrimsIncrement{ 0 }; - bool nextReset{ false }; - bool isStreaming{ false }; + uint32_t nextNumSimdPrims{0}; + uint32_t nextNumPrimsIncrement{0}; + bool nextReset{false}; + bool isStreaming{false}; - SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function + SIMDMASK junkIndices{0}; // temporary index store for unused virtual function PA_STATE_OPT() {} - PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, - uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); + PA_STATE_OPT(DRAW_CONTEXT* pDC, + uint32_t numPrims, + uint8_t* pStream, + uint32_t streamSizeInVerts, + uint32_t vertexStride, + bool in_isStreaming, + uint32_t numVertsPerPrim, + PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); - bool HasWork() - { - return (this->numPrimsComplete < this->numPrims) ? true : false; - } + bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; } simdvector& GetSimdVector(uint32_t index, uint32_t slot) { SWR_ASSERT(slot < vertexStride); - uint32_t offset = index * vertexStride + slot; + uint32_t offset = index * vertexStride + slot; simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset]; return vertexSlot; } @@ -188,7 +205,7 @@ struct PA_STATE_OPT : public PA_STATE simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) { SWR_ASSERT(slot < vertexStride); - uint32_t offset = index * vertexStride + slot; + uint32_t offset = index * vertexStride + slot; simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset]; return vertexSlot; } @@ -196,10 +213,7 @@ struct PA_STATE_OPT : public PA_STATE #endif // Assembles 4 triangles. Each simdvector is a single vertex from 4 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. - bool Assemble(uint32_t slot, simdvector verts[]) - { - return this->pfnPaFunc(*this, slot, verts); - } + bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); } #if ENABLE_AVX512_SIMD16 bool Assemble(uint32_t slot, simd16vector verts[]) @@ -239,12 +253,12 @@ struct PA_STATE_OPT : public PA_STATE else { this->counter = (this->reset) ? 
0 : (this->counter + 1); - this->reset = false; + this->reset = false; } if (!HasWork()) { - morePrims = false; // no more to do + morePrims = false; // no more to do } return morePrims; @@ -259,15 +273,16 @@ struct PA_STATE_OPT : public PA_STATE { // prev undefined for first state prev = cur; - cur = counter; + cur = counter; } else { - // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer + // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in + // the buffer uint32_t temp = prev; prev = cur; - cur = temp; + cur = temp; } SWR_ASSERT(cur < numSimdVerts); @@ -285,44 +300,46 @@ struct PA_STATE_OPT : public PA_STATE bool GetNextStreamOutput() { this->prev = this->cur; - this->cur = this->counter; + this->cur = this->counter; return HasWork(); } uint32_t NumPrims() { - return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? - (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH; + return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) + ? (SIMD_WIDTH - + (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) + : SIMD_WIDTH; } - void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) + void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) { - this->pfnPaNextFunc = pfnPaNextFunc; - this->nextNumSimdPrims = numSimdPrims; + this->pfnPaNextFunc = pfnPaNextFunc; + this->nextNumSimdPrims = numSimdPrims; this->nextNumPrimsIncrement = numPrimsIncrement; - this->nextReset = reset; + this->nextReset = reset; this->pfnPaSingleFunc = pfnPaNextSingleFunc; } #if ENABLE_AVX512_SIMD16 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, - PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) - { - this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; - this->pfnPaNextFunc = pfnPaNextFunc; - this->nextNumSimdPrims = numSimdPrims; + PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) + { + this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; + this->pfnPaNextFunc = pfnPaNextFunc; + this->nextNumSimdPrims = numSimdPrims; this->nextNumPrimsIncrement = numPrimsIncrement; - this->nextReset = reset; + this->nextReset = reset; this->pfnPaSingleFunc = pfnPaNextSingleFunc; } @@ -339,44 +356,54 @@ struct PA_STATE_OPT : public PA_STATE this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16; #endif this->numPrimsComplete = 0; - this->numSimdPrims = 0; - this->cur = 0; - this->prev = 0; - this->counter = 0; - this->reset = false; + this->numSimdPrims = 0; + this->cur = 0; + this->prev = 0; + this->counter = 0; + this->reset = false; } SIMDSCALARI GetPrimID(uint32_t startID) { #if USE_SIMD16_FRONTEND - return _simd16_add_epi32(this->primID, + return _simd16_add_epi32( + this->primID, _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); #else - return _simd_add_epi32(this->primID, + return _simd_add_epi32( + this->primID, 
_simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); #endif } }; // helper C wrappers to avoid having to rewrite all the PA topology state functions -INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) +INLINE void SetNextPaState(PA_STATE_OPT& pa, + PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) { - return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); + return pa.SetNextState( + pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); } #if ENABLE_AVX512_SIMD16 -INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, - PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) +INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, + PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, + PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) { - return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); + return pa.SetNextState_simd16(pfnPaNextFunc_simd16, + pfnPaNextFunc, + pfnPaNextSingleFunc, + numSimdPrims, + numPrimsIncrement, + reset); } #endif @@ -395,59 +422,70 @@ INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32 // Cut-aware primitive assembler. 
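The cut-aware assembler defined next tracks cuts as one bit per vertex, packed SIMD_WIDTH bits to a mask word; its IsCutIndex helper (further down) selects the word with vertex / SIMD_WIDTH and the bit with vertex & (SIMD_WIDTH - 1). A standalone sketch of that bit test (not part of this commit; plain uint8_t words stand in for SIMDMASK):

// Sketch: per-vertex cut bits, SIMD_WIDTH bits per mask word.
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t simdWidth    = 8;  // AVX2-style batch, one bit per lane
    uint8_t        cutMasks[4]  = {}; // 4 words cover 32 vertices
    uint32_t       cutVertex    = 13; // mark vertex 13 as a cut

    cutMasks[cutVertex / simdWidth] |= 1u << (cutVertex & (simdWidth - 1));

    // the IsCutIndex-style test: word = v / width, bit = v % width
    assert(((cutMasks[13 / simdWidth] >> (13 & (simdWidth - 1))) & 1) != 0);
    assert(((cutMasks[12 / simdWidth] >> (12 & (simdWidth - 1))) & 1) == 0);
    return 0;
}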
struct PA_STATE_CUT : public PA_STATE { - SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex - uint32_t numVerts{ 0 }; // number of vertices available in buffer store - uint32_t numAttribs{ 0 }; // number of attributes - int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled - uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw + SIMDMASK* pCutIndices{nullptr}; // cut indices buffer, 1 bit per vertex + uint32_t numVerts{0}; // number of vertices available in buffer store + uint32_t numAttribs{0}; // number of attributes + int32_t numRemainingVerts{0}; // number of verts remaining to be assembled + uint32_t numVertsToAssemble{0}; // total number of verts to assemble for the draw #if ENABLE_AVX512_SIMD16 - OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather + OSALIGNSIMD16(uint32_t) + indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather #else - OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather + OSALIGNSIMD(uint32_t) + indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather #endif - SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd - uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled - uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store - uint32_t tailVertex{ 0 }; // beginning vertex currently assembling - uint32_t curVertex{ 0 }; // current unprocessed vertex - uint32_t startPrimId{ 0 }; // starting prim id - SIMDSCALARI vPrimId; // vector of prim ID - bool needOffsets{ false }; // need to compute gather offsets for current SIMD - uint32_t vertsPerPrim{ 0 }; - bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they - // are ignored. Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index - - simdvector junkVector; // junk simdvector for unimplemented API + SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd + uint32_t numPrimsAssembled{0}; // number of primitives that are fully assembled + uint32_t headVertex{0}; // current unused vertex slot in vertex buffer store + uint32_t tailVertex{0}; // beginning vertex currently assembling + uint32_t curVertex{0}; // current unprocessed vertex + uint32_t startPrimId{0}; // starting prim id + SIMDSCALARI vPrimId; // vector of prim ID + bool needOffsets{false}; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim{0}; + bool processCutVerts{ + false}; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored + // while the GS sends valid verts for every index + + simdvector junkVector; // junk simdvector for unimplemented API #if ENABLE_AVX512_SIMD16 - simd16vector junkVector_simd16; // junk simd16vector for unimplemented API + simd16vector junkVector_simd16; // junk simd16vector for unimplemented API #endif // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; - uint32_t curIndex{ 0 }; - bool reverseWinding{ false }; // indicates reverse winding for strips - int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj + uint32_t curIndex{0}; + bool reverseWinding{false}; // indicates reverse winding for strips + int32_t adjExtraVert{0}; // extra vert uses for tristrip w/ adj - typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); - PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert + typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish); + PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert PA_STATE_CUT() {} - PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts, - uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim) - : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim) - { - numVerts = in_streamSizeInVerts; - numAttribs = in_numAttribs; - binTopology = topo; - needOffsets = false; + PA_STATE_CUT(DRAW_CONTEXT* pDC, + uint8_t* in_pStream, + uint32_t in_streamSizeInVerts, + uint32_t in_vertexStride, + SIMDMASK* in_pIndices, + uint32_t in_numVerts, + uint32_t in_numAttribs, + PRIMITIVE_TOPOLOGY topo, + bool in_processCutVerts, + uint32_t in_numVertsPerPrim) : + PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim) + { + numVerts = in_streamSizeInVerts; + numAttribs = in_numAttribs; + binTopology = topo; + needOffsets = false; processCutVerts = in_processCutVerts; numVertsToAssemble = numRemainingVerts = in_numVerts; - numPrimsAssembled = 0; + numPrimsAssembled = 0; headVertex = tailVertex = curVertex = 0; - curIndex = 0; + curIndex = 0; pCutIndices = in_pIndices; memset(indices, 0, sizeof(indices)); #if USE_SIMD16_FRONTEND @@ -456,49 +494,72 @@ struct PA_STATE_CUT : public PA_STATE vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); #endif reverseWinding = false; - adjExtraVert = -1; + adjExtraVert = -1; bool gsEnabled = pDC->pState->state.gsState.gsEnable; - vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); + vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); switch (topo) { - case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; - case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; - case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; - case TOP_TRI_STRIP_ADJ: if (gsEnabled) - { - pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; - } - else - { - pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; - } - break; - - case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; - case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; - case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? 
&PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; - case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; - case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; - case TOP_RECT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertRectList; break; - default: assert(0 && "Unimplemented topology"); + case TOP_TRIANGLE_LIST: + pfnPa = &PA_STATE_CUT::ProcessVertTriList; + break; + case TOP_TRI_LIST_ADJ: + pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj + : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; + break; + case TOP_TRIANGLE_STRIP: + pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; + break; + case TOP_TRI_STRIP_ADJ: + if (gsEnabled) + { + pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>; + } + else + { + pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>; + } + break; + + case TOP_POINT_LIST: + pfnPa = &PA_STATE_CUT::ProcessVertPointList; + break; + case TOP_LINE_LIST: + pfnPa = &PA_STATE_CUT::ProcessVertLineList; + break; + case TOP_LINE_LIST_ADJ: + pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj + : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; + break; + case TOP_LINE_STRIP: + pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; + break; + case TOP_LISTSTRIP_ADJ: + pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj + : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; + break; + case TOP_RECT_LIST: + pfnPa = &PA_STATE_CUT::ProcessVertRectList; + break; + default: + assert(0 && "Unimplemented topology"); } } SIMDVERTEX& GetNextVsOutput() { uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; - this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts; - this->needOffsets = true; - SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride]; + this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts; + this->needOffsets = true; + SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride]; return *(SIMDVERTEX*)pVertex; } SIMDMASK& GetNextVsIndices() { - uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; + uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex; return *pCurCutIndex; } @@ -543,12 +604,12 @@ struct PA_STATE_CUT : public PA_STATE #endif this->numRemainingVerts = this->numVertsToAssemble; this->numPrimsAssembled = 0; - this->curIndex = 0; - this->curVertex = 0; - this->tailVertex = 0; - this->headVertex = 0; - this->reverseWinding = false; - this->adjExtraVert = -1; + this->curIndex = 0; + this->curVertex = 0; + this->tailVertex = 0; + this->headVertex = 0; + this->reverseWinding = false; + this->adjExtraVert = -1; #if USE_SIMD16_FRONTEND this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); #else @@ -556,10 +617,7 @@ struct PA_STATE_CUT : public PA_STATE #endif } - bool HasWork() - { - return this->numRemainingVerts > 0 || this->adjExtraVert != -1; - } + bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; } bool IsVertexStoreFull() { @@ -568,14 +626,14 @@ struct PA_STATE_CUT : public PA_STATE void RestartTopology() { - this->curIndex = 0; + this->curIndex = 0; this->reverseWinding = false; - this->adjExtraVert = -1; + this->adjExtraVert = -1; } bool IsCutIndex(uint32_t vertex) { - uint32_t vertexIndex = vertex / SIMD_WIDTH; + uint32_t vertexIndex = vertex / SIMD_WIDTH; uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1); return CheckBit(this->pCutIndices[vertexIndex], 
vertexOffset); } @@ -584,9 +642,8 @@ struct PA_STATE_CUT : public PA_STATE // have assembled SIMD prims void ProcessVerts() { - while (this->numPrimsAssembled != SIMD_WIDTH && - this->numRemainingVerts > 0 && - this->curVertex != this->headVertex) + while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 && + this->curVertex != this->headVertex) { // if cut index, restart topology if (IsCutIndex(this->curVertex)) @@ -608,14 +665,16 @@ struct PA_STATE_CUT : public PA_STATE } this->curVertex++; - if (this->curVertex >= this->numVerts) { - this->curVertex = 0; + if (this->curVertex >= this->numVerts) + { + this->curVertex = 0; } this->numRemainingVerts--; } // special case last primitive for tri strip w/ adj - if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) + if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && + this->adjExtraVert != -1) { (this->*pfnPa)(this->curVertex, true); } @@ -625,7 +684,7 @@ struct PA_STATE_CUT : public PA_STATE { // done with current batch // advance tail to the current unsubmitted vertex - this->tailVertex = this->curVertex; + this->tailVertex = this->curVertex; this->numPrimsAssembled = 0; #if USE_SIMD16_FRONTEND this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH)); @@ -648,32 +707,38 @@ struct PA_STATE_CUT : public PA_STATE { for (uint32_t v = 0; v < this->vertsPerPrim; ++v) { - uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR); - SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0]; + uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR); + SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0]; // step to simdvertex batch const uint32_t simdShift = SIMD_WIDTH_LOG2; #if USE_SIMD16_FRONTEND SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift); - this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes)); + this->vOffsets[v] = + _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes)); #else SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift); - this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes)); + this->vOffsets[v] = + _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes)); #endif // step to index const uint32_t simdMask = SIMD_WIDTH - 1; #if USE_SIMD16_FRONTEND SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask)); - this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float)))); + this->vOffsets[v] = _simd16_add_epi32( + this->vOffsets[v], + _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float)))); #else SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); - this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); + this->vOffsets[v] = + _simd_add_epi32(this->vOffsets[v], + _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); #endif } } - bool Assemble(uint32_t slot, simdvector *verts) + bool Assemble(uint32_t slot, simdvector* verts) { // process any outstanding verts ProcessVerts(); @@ -684,7 +749,8 @@ struct PA_STATE_CUT : public PA_STATE return false; } - // cache off gather offsets given the current SIMD set of indices the first time we get an assemble + // cache off gather offsets given the current SIMD set of indices the first time we get an + // assemble if 
(this->needOffsets) { ComputeOffsets(); @@ -709,7 +775,8 @@ struct PA_STATE_CUT : public PA_STATE simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1); // Assigning to a temporary first to avoid an MSVC 2017 compiler bug - simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); + simdscalar t = + useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); verts[v].v[c] = t; #else verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); @@ -728,7 +795,7 @@ struct PA_STATE_CUT : public PA_STATE // v1, v3 = v1 + v2 - v0, v2 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2] simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]); - temp = _simd16_sub_ps(temp, verts[1].v[c]); + temp = _simd16_sub_ps(temp, verts[1].v[c]); temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010 verts[1].v[c] = _simd16_extract_ps(temp, 0); } @@ -740,7 +807,7 @@ struct PA_STATE_CUT : public PA_STATE #if ENABLE_AVX512_SIMD16 bool Assemble(uint32_t slot, simd16vector verts[]) { - // process any outstanding verts + // process any outstanding verts ProcessVerts(); // return false if we don't have enough prims assembled @@ -749,7 +816,8 @@ struct PA_STATE_CUT : public PA_STATE return false; } - // cache off gather offsets given the current SIMD set of indices the first time we get an assemble + // cache off gather offsets given the current SIMD set of indices the first time we get an + // assemble if (this->needOffsets) { ComputeOffsets(); @@ -773,7 +841,8 @@ struct PA_STATE_CUT : public PA_STATE #if USE_SIMD16_FRONTEND verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1); #else - verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0); + verts[v].v[c] = _simd16_insert_ps( + _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0); #endif // move base to next component @@ -789,8 +858,9 @@ struct PA_STATE_CUT : public PA_STATE // v1, v3 = v1 + v2 - v0, v2 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2] simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]); - temp = _simd16_sub_ps(temp, verts[1].v[c]); - verts[1].v[c] = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010 + temp = _simd16_sub_ps(temp, verts[1].v[c]); + verts[1].v[c] = + _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010 } } @@ -800,12 +870,13 @@ struct PA_STATE_CUT : public PA_STATE #endif void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3]) { - // move to slot + // move to slot for (uint32_t v = 0; v < this->vertsPerPrim; ++v) { uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; #if USE_SIMD16_FRONTEND - uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex]; + uint32_t offset = + useAlternateOffset ? 
pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex]; #else uint32_t offset = pOffset[triIndex]; #endif @@ -814,7 +885,7 @@ struct PA_STATE_CUT : public PA_STATE for (uint32_t c = 0; c < 4; ++c) { float* pComponent = (float*)(this->pStreamBase + offset); - pVert[c] = *pComponent; + pVert[c] = *pComponent; offset += SIMD_WIDTH * sizeof(float); } } @@ -835,10 +906,7 @@ struct PA_STATE_CUT : public PA_STATE } } - uint32_t NumPrims() - { - return this->numPrimsAssembled; - } + uint32_t NumPrims() { return this->numPrimsAssembled; } // Per-topology functions void ProcessVertTriStrip(uint32_t index, bool finish) @@ -864,14 +932,14 @@ struct PA_STATE_CUT : public PA_STATE this->numPrimsAssembled++; // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; this->curIndex = 2; this->reverseWinding ^= 1; } } - template<bool gsEnabled> + template <bool gsEnabled> void AssembleTriStripAdj() { if (!gsEnabled) @@ -898,8 +966,7 @@ struct PA_STATE_CUT : public PA_STATE this->numPrimsAssembled++; } - - template<bool gsEnabled> + template <bool gsEnabled> void ProcessVertTriStripAdj(uint32_t index, bool finish) { // handle last primitive of tristrip @@ -1059,7 +1126,6 @@ struct PA_STATE_CUT : public PA_STATE } } - void ProcessVertLineList(uint32_t index, bool finish) { this->vert[this->curIndex] = index; @@ -1088,7 +1154,7 @@ struct PA_STATE_CUT : public PA_STATE this->numPrimsAssembled++; // set up next prim state - this->vert[0] = this->vert[1]; + this->vert[0] = this->vert[1]; this->curIndex = 1; } } @@ -1109,9 +1175,9 @@ struct PA_STATE_CUT : public PA_STATE this->numPrimsAssembled++; // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; - this->vert[2] = this->vert[3]; + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; this->curIndex = 3; } } @@ -1130,9 +1196,9 @@ struct PA_STATE_CUT : public PA_STATE this->numPrimsAssembled++; // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; - this->vert[2] = this->vert[3]; + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; this->curIndex = 3; } } @@ -1192,9 +1258,9 @@ struct PA_STATE_CUT : public PA_STATE // second triangle in the rectangle // v1, v3 = v1 + v2 - v0, v2 - this->indices[0][this->numPrimsAssembled+1] = this->vert[1]; - this->indices[1][this->numPrimsAssembled+1] = this->vert[0]; - this->indices[2][this->numPrimsAssembled+1] = this->vert[2]; + this->indices[0][this->numPrimsAssembled + 1] = this->vert[1]; + this->indices[1][this->numPrimsAssembled + 1] = this->vert[0]; + this->indices[2][this->numPrimsAssembled + 1] = this->vert[2]; // increment numPrimsAssembled this->numPrimsAssembled += 2; @@ -1208,29 +1274,26 @@ struct PA_STATE_CUT : public PA_STATE // Primitive Assembly for data output from the DomainShader. 
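// Before the struct itself, one detail worth a sketch: when fewer than
// SIMD_WIDTH primitives remain, PA_TESS::GenPrimMask (below) builds the
// per-lane validity mask with a single unaligned load from a constant
// table of SIMD_WIDTH -1 entries followed by SIMD_WIDTH 0 entries;
// starting the load at (SIMD_WIDTH - numPrims) yields exactly numPrims
// active lanes. A scalar model of that trick, assuming SIMD_WIDTH == 8
// (genPrimMask and its output array are hypothetical names, not driver
// code):

#include <cstdint>
#include <cstring>

static const int32_t maskGen[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                                    0,  0,  0,  0,  0,  0,  0,  0};

void genPrimMask(uint32_t numPrims, int32_t mask[8])
{
    // numPrims <= 8; the sliding-window load is emulated with memcpy
    std::memcpy(mask, &maskGen[8 - numPrims], 8 * sizeof(int32_t));
}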
struct PA_TESS : PA_STATE { - PA_TESS( - DRAW_CONTEXT *in_pDC, - const SIMDSCALAR* in_pVertData, - uint32_t in_attributeStrideInVectors, - uint32_t in_vertexStride, - uint32_t in_numAttributes, - uint32_t* (&in_ppIndices)[3], - uint32_t in_numPrims, - PRIMITIVE_TOPOLOGY in_binTopology, - uint32_t numVertsPerPrim) : + PA_TESS(DRAW_CONTEXT* in_pDC, + const SIMDSCALAR* in_pVertData, + uint32_t in_attributeStrideInVectors, + uint32_t in_vertexStride, + uint32_t in_numAttributes, + uint32_t* (&in_ppIndices)[3], + uint32_t in_numPrims, + PRIMITIVE_TOPOLOGY in_binTopology, + uint32_t numVertsPerPrim) : PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim), - m_pVertexData(in_pVertData), - m_attributeStrideInVectors(in_attributeStrideInVectors), - m_numAttributes(in_numAttributes), - m_numPrims(in_numPrims) + m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors), + m_numAttributes(in_numAttributes), m_numPrims(in_numPrims) { #if USE_SIMD16_FRONTEND m_vPrimId = _simd16_setzero_si(); #else m_vPrimId = _simd_setzero_si(); #endif - binTopology = in_binTopology; + binTopology = in_binTopology; m_ppIndices[0] = in_ppIndices[0]; m_ppIndices[1] = in_ppIndices[1]; m_ppIndices[2] = in_ppIndices[2]; @@ -1255,10 +1318,7 @@ struct PA_TESS : PA_STATE } } - bool HasWork() - { - return m_numPrims != 0; - } + bool HasWork() { return m_numPrims != 0; } simdvector& GetSimdVector(uint32_t index, uint32_t slot) { @@ -1278,19 +1338,14 @@ struct PA_TESS : PA_STATE { SWR_ASSERT(numPrims <= SIMD_WIDTH); #if USE_SIMD16_FRONTEND - static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = - { + static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); #else - static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = - { - -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0 - }; + static const OSALIGNLINE(int32_t) + maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); #endif @@ -1308,7 +1363,8 @@ struct PA_TESS : PA_STATE SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); - const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + const float* pBaseAttrib = + (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) { #if USE_SIMD16_FRONTEND @@ -1321,21 +1377,21 @@ struct PA_TESS : PA_STATE for (uint32_t c = 0; c < 4; ++c) { #if USE_SIMD16_FRONTEND - simd16scalar temp = _simd16_mask_i32gather_ps( - _simd16_setzero_ps(), - pBase, - indices, - _simd16_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); - - verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); + simd16scalar temp = + _simd16_mask_i32gather_ps(_simd16_setzero_ps(), + pBase, + indices, + _simd16_castsi_ps(mask), + 4 /* gcc doesn't like sizeof(float) */); + + verts[i].v[c] = + useAlternateOffset ? 
_simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); #else - verts[i].v[c] = _simd_mask_i32gather_ps( - _simd_setzero_ps(), - pBase, - indices, - _simd_castsi_ps(mask), - 4); // gcc doesn't like sizeof(float) + verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(), + pBase, + indices, + _simd_castsi_ps(mask), + 4); // gcc doesn't like sizeof(float) #endif pBase += m_attributeStrideInVectors * SIMD_WIDTH; } @@ -1357,7 +1413,8 @@ struct PA_TESS : PA_STATE SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); - const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + const float* pBaseAttrib = + (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) { #if USE_SIMD16_FRONTEND @@ -1370,20 +1427,18 @@ struct PA_TESS : PA_STATE for (uint32_t c = 0; c < 4; ++c) { #if USE_SIMD16_FRONTEND - verts[i].v[c] = _simd16_mask_i32gather_ps( - _simd16_setzero_ps(), - pBase, - indices, - _simd16_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); + verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(), + pBase, + indices, + _simd16_castsi_ps(mask), + 4 /* gcc doesn't like sizeof(float) */); #else - simdscalar temp = _simd_mask_i32gather_ps( - _simd_setzero_ps(), - pBase, - indices, - _simd_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); - verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), + pBase, + indices, + _simd_castsi_ps(mask), + 4 /* gcc doesn't like sizeof(float) */); + verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); #endif pBase += m_attributeStrideInVectors * SIMD_WIDTH; } @@ -1396,19 +1451,22 @@ struct PA_TESS : PA_STATE void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { SWR_ASSERT(slot < m_numAttributes); - SWR_ASSERT(primIndex < PA_TESS::NumPrims()); - const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + SWR_ASSERT(primIndex < PA_TESS::NumPrims()); + + const float* pVertDataBase = + (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) { #if USE_SIMD16_FRONTEND - uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex]; + uint32_t index = useAlternateOffset ? 
m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] + : m_ppIndices[i][primIndex]; #else uint32_t index = m_ppIndices[i][primIndex]; #endif const float* pVertData = pVertDataBase; - float* pVert = (float*)&verts[i]; + float* pVert = (float*)&verts[i]; for (uint32_t c = 0; c < 4; ++c) { @@ -1447,15 +1505,9 @@ struct PA_TESS : PA_STATE return junkIndices; } - uint32_t NumPrims() - { - return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); - } + uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); } - void Reset() - { - SWR_NOT_IMPL; - } + void Reset() { SWR_NOT_IMPL; } SIMDSCALARI GetPrimID(uint32_t startID) { @@ -1467,57 +1519,77 @@ struct PA_TESS : PA_STATE } private: - const SIMDSCALAR* m_pVertexData = nullptr; - uint32_t m_attributeStrideInVectors = 0; - uint32_t m_numAttributes = 0; - uint32_t m_numPrims = 0; - uint32_t* m_ppIndices[3]; + const SIMDSCALAR* m_pVertexData = nullptr; + uint32_t m_attributeStrideInVectors = 0; + uint32_t m_numAttributes = 0; + uint32_t m_numPrims = 0; + uint32_t* m_ppIndices[3]; - uint32_t m_numVertsPerPrim = 0; + uint32_t m_numVertsPerPrim = 0; - SIMDSCALARI m_vPrimId; + SIMDSCALARI m_vPrimId; - simdvector junkVector; // junk simdvector for unimplemented API + simdvector junkVector; // junk simdvector for unimplemented API #if ENABLE_AVX512_SIMD16 - simd16vector junkVector_simd16; // junk simd16vector for unimplemented API + simd16vector junkVector_simd16; // junk simd16vector for unimplemented API #endif - SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API - SIMDMASK junkIndices; // temporary index store for unused virtual function + SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API + SIMDMASK junkIndices; // temporary index store for unused virtual function }; -// Primitive Assembler factory class, responsible for creating and initializing the correct assembler -// based on state. +// Primitive Assembler factory class, responsible for creating and initializing the correct +// assembler based on state. 
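// The factory below routes each draw to either the cut-aware or the
// optimized assembler. Stripped of the template machinery, its selection
// condition reduces to the following predicate (a sketch; the function
// name is hypothetical, and the enum is a trimmed stand-in for the
// driver's PRIMITIVE_TOPOLOGY):

enum PRIMITIVE_TOPOLOGY
{
    TOP_POINT_LIST, TOP_LINE_LIST, TOP_LINE_STRIP, TOP_TRIANGLE_LIST,
    TOP_TRIANGLE_STRIP, TOP_LINE_LIST_ADJ, TOP_LISTSTRIP_ADJ,
    TOP_TRI_LIST_ADJ, TOP_TRI_STRIP_ADJ,
};

bool NeedsCutAwarePA(bool indexed, bool cutIndexEnabled, PRIMITIVE_TOPOLOGY topo)
{
    // Indexed draws with primitive restart on the basic topologies...
    const bool basicTopo = topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
                           topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
                           topo == TOP_TRIANGLE_LIST;

    // ...and all adjacency topologies (not yet supported by the
    // optimized PA, per the comment in the source) take the cut-aware path.
    const bool adjTopo = topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ ||
                         topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ;

    return (indexed && cutIndexEnabled && basicTopo) || adjTopo;
}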
template <typename IsIndexedT, typename IsCutIndexEnabledT> struct PA_FACTORY { - PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo) + PA_FACTORY(DRAW_CONTEXT* pDC, + PRIMITIVE_TOPOLOGY in_topo, + uint32_t numVerts, + PA_STATE::SIMDVERTEX* pVertexStore, + uint32_t vertexStoreSize, + uint32_t vertexStride, + uint32_t numVertsPerPrim) : + topo(in_topo) { #if KNOB_ENABLE_CUT_AWARE_PA == TRUE const API_STATE& state = GetApiState(pDC); - if ((IsIndexedT::value && IsCutIndexEnabledT::value && ( - topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || - topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || - topo == TOP_TRIANGLE_LIST)) || - - // non-indexed draws with adjacency topologies must use cut-aware PA until we add support - // for them in the optimized PA - (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)) + if ((IsIndexedT::value && IsCutIndexEnabledT::value && + (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST || + topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) || + + // non-indexed draws with adjacency topologies must use cut-aware PA until we add + // support for them in the optimized PA + (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || + topo == TOP_TRI_STRIP_ADJ)) { memset(&indexStore, 0, sizeof(indexStore)); uint32_t numAttribs = state.feNumAttributes; - new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, - vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim); + new (&this->paCut) PA_STATE_CUT(pDC, + reinterpret_cast<uint8_t*>(pVertexStore), + vertexStoreSize * PA_STATE::SIMD_WIDTH, + vertexStride, + &this->indexStore[0], + numVerts, + numAttribs, + state.topology, + false, + numVertsPerPrim); cutPA = true; } else #endif { uint32_t numPrims = GetNumPrims(in_topo, numVerts); - new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim); + new (&this->paOpt) PA_STATE_OPT(pDC, + numPrims, + reinterpret_cast<uint8_t*>(pVertexStore), + vertexStoreSize * PA_STATE::SIMD_WIDTH, + vertexStride, + false, + numVertsPerPrim); cutPA = false; } - } PA_STATE& GetPA() @@ -1537,9 +1609,9 @@ struct PA_FACTORY PA_STATE_OPT paOpt; PA_STATE_CUT paCut; - bool cutPA{ false }; + bool cutPA{false}; - PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; + PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN}; - PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; + PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index 4f89e0c1799..25d7156ac63 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -1,136 +1,160 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file pa_avx.cpp -* -* @brief AVX implementation for primitive assembly. -* N primitives are assembled at a time, where N is the SIMD width. -* A state machine, that is specific for a given topology, drives the -* assembly of vertices into triangles. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file pa_avx.cpp + * + * @brief AVX implementation for primitive assembly. + * N primitives are assembled at a time, where N is the SIMD width. + * A state machine, that is specific for a given topology, drives the + * assembly of vertices into triangles. 
+ * + ******************************************************************************/ #include "context.h" #include "pa.h" #include "frontend.h" #if (KNOB_SIMD_WIDTH == 8) -INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane0(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpacklo_ps(x, z); simdscalar tmp1 = _mm256_unpacklo_ps(y, w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); } -INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane1(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpacklo_ps(x, z); simdscalar tmp1 = _mm256_unpacklo_ps(y, w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); } -INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane2(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpackhi_ps(x, z); simdscalar tmp1 = _mm256_unpackhi_ps(y, w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); } -INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane3(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpackhi_ps(x, z); simdscalar tmp1 = _mm256_unpackhi_ps(y, w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); } -INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane4(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpacklo_ps(x, z); simdscalar tmp1 = _mm256_unpacklo_ps(y, w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); } -INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane5(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpacklo_ps(x, z); simdscalar tmp1 = _mm256_unpacklo_ps(y, w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); } -INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane6(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpackhi_ps(x, z); simdscalar tmp1 = _mm256_unpackhi_ps(y, w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); } -INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w) +INLINE simd4scalar swizzleLane7(const simdscalar& x, + const simdscalar& y, + const simdscalar& z, + const simdscalar& w) { simdscalar tmp0 = _mm256_unpackhi_ps(x, z); simdscalar tmp1 = _mm256_unpackhi_ps(y, w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); } -INLINE simd4scalar swizzleLane0(const simdvector &v) +INLINE simd4scalar swizzleLane0(const simdvector& v) { return swizzleLane0(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane1(const simdvector &v) 
+INLINE simd4scalar swizzleLane1(const simdvector& v) { return swizzleLane1(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane2(const simdvector &v) +INLINE simd4scalar swizzleLane2(const simdvector& v) { return swizzleLane2(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane3(const simdvector &v) +INLINE simd4scalar swizzleLane3(const simdvector& v) { return swizzleLane3(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane4(const simdvector &v) +INLINE simd4scalar swizzleLane4(const simdvector& v) { return swizzleLane4(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane5(const simdvector &v) +INLINE simd4scalar swizzleLane5(const simdvector& v) { return swizzleLane5(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane6(const simdvector &v) +INLINE simd4scalar swizzleLane6(const simdvector& v) { return swizzleLane6(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLane7(const simdvector &v) +INLINE simd4scalar swizzleLane7(const simdvector& v) { return swizzleLane7(v.x, v.y, v.z, v.w); } -INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane) +INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane) { switch (lane) { @@ -156,87 +180,135 @@ INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane) } #if ENABLE_AVX512_SIMD16 -INLINE simd4scalar swizzleLane0(const simd16vector &v) +INLINE simd4scalar swizzleLane0(const simd16vector& v) { - return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane0(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane1(const simd16vector &v) +INLINE simd4scalar swizzleLane1(const simd16vector& v) { - return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane1(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane2(const simd16vector &v) +INLINE simd4scalar swizzleLane2(const simd16vector& v) { - return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane2(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane3(const simd16vector &v) +INLINE simd4scalar swizzleLane3(const simd16vector& v) { - return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane3(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane4(const simd16vector &v) +INLINE simd4scalar swizzleLane4(const simd16vector& v) { - return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane4(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane5(const simd16vector &v) +INLINE simd4scalar swizzleLane5(const simd16vector& v) { - return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane5(_simd16_extract_ps(v.x, 0), + 
_simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane6(const simd16vector &v) +INLINE simd4scalar swizzleLane6(const simd16vector& v) { - return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane6(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane7(const simd16vector &v) +INLINE simd4scalar swizzleLane7(const simd16vector& v) { - return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0)); + return swizzleLane7(_simd16_extract_ps(v.x, 0), + _simd16_extract_ps(v.y, 0), + _simd16_extract_ps(v.z, 0), + _simd16_extract_ps(v.w, 0)); } -INLINE simd4scalar swizzleLane8(const simd16vector &v) +INLINE simd4scalar swizzleLane8(const simd16vector& v) { - return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane0(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLane9(const simd16vector &v) +INLINE simd4scalar swizzleLane9(const simd16vector& v) { - return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane1(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneA(const simd16vector &v) +INLINE simd4scalar swizzleLaneA(const simd16vector& v) { - return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane2(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneB(const simd16vector &v) +INLINE simd4scalar swizzleLaneB(const simd16vector& v) { - return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane3(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneC(const simd16vector &v) +INLINE simd4scalar swizzleLaneC(const simd16vector& v) { - return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane4(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneD(const simd16vector &v) +INLINE simd4scalar swizzleLaneD(const simd16vector& v) { - return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane5(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneE(const simd16vector &v) +INLINE simd4scalar swizzleLaneE(const simd16vector& v) { - return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane6(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 
1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneF(const simd16vector &v) +INLINE simd4scalar swizzleLaneF(const simd16vector& v) { - return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1)); + return swizzleLane7(_simd16_extract_ps(v.x, 1), + _simd16_extract_ps(v.y, 1), + _simd16_extract_ps(v.z, 1), + _simd16_extract_ps(v.w, 1)); } -INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane) +INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane) { switch (lane) { @@ -374,11 +446,11 @@ void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd { uint32_t input_cp = primIndex * TotalControlPoints + cp; #if USE_SIMD16_FRONTEND - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; #else - uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; #endif @@ -386,7 +458,8 @@ void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd for (uint32_t i = 0; i < 4; ++i) { #if USE_SIMD16_FRONTEND - const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); + const float* pInputVec = + (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); #else const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); #endif @@ -395,18 +468,17 @@ void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd } } -template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> +template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { - SetNextPaState( - pa, - PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchListSingle<TotalControlPoints>); + SetNextPaState(pa, + PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, + PaPatchListSingle<TotalControlPoints>); return false; } -template<uint32_t TotalControlPoints> +template <uint32_t TotalControlPoints> static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output @@ -433,14 +505,15 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) { #if USE_SIMD16_FRONTEND - uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; - const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); + const float* pInputVec = + (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); #else - uint32_t input_cp = lane * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_cp = lane * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); @@ -451,31 +524,29 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } } - SetNextPaState( - pa, - 
PaPatchList<TotalControlPoints>, - PaPatchListSingle<TotalControlPoints>, - 0, - PA_STATE_OPT::SIMD_WIDTH, - true); + SetNextPaState(pa, + PaPatchList<TotalControlPoints>, + PaPatchListSingle<TotalControlPoints>, + 0, + PA_STATE_OPT::SIMD_WIDTH, + true); return true; } #if ENABLE_AVX512_SIMD16 -template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> +template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - SetNextPaState_simd16( - pa, - PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchListSingle<TotalControlPoints>); + SetNextPaState_simd16(pa, + PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>, + PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, + PaPatchListSingle<TotalControlPoints>); return false; } -template<uint32_t TotalControlPoints> +template <uint32_t TotalControlPoints> static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output @@ -492,33 +563,35 @@ static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector float vec[KNOB_SIMD16_WIDTH]; for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane) { - uint32_t input_cp = lane * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_cp = lane * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); - vec[lane] = pInputVec[input_lane]; + vec[lane] = pInputVec[input_lane]; } verts[cp][i] = _simd16_loadu_ps(vec); } } - SetNextPaState_simd16( - pa, - PaPatchList_simd16<TotalControlPoints>, - PaPatchList<TotalControlPoints>, - PaPatchListSingle<TotalControlPoints>, - 0, - PA_STATE_OPT::SIMD_WIDTH, - true); + SetNextPaState_simd16(pa, + PaPatchList_simd16<TotalControlPoints>, + PaPatchList<TotalControlPoints>, + PaPatchListSingle<TotalControlPoints>, + 0, + PA_STATE_OPT::SIMD_WIDTH, + true); return true; } #endif -#define PA_PATCH_LIST_TERMINATOR(N) \ - template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ - { return PaPatchListTerm<N>(pa, slot, verts); } +#define PA_PATCH_LIST_TERMINATOR(N) \ + template <> \ + bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \ + { \ + return PaPatchListTerm<N>(pa, slot, verts); \ + } PA_PATCH_LIST_TERMINATOR(1) PA_PATCH_LIST_TERMINATOR(2) PA_PATCH_LIST_TERMINATOR(3) @@ -554,9 +627,12 @@ PA_PATCH_LIST_TERMINATOR(32) #undef PA_PATCH_LIST_TERMINATOR #if ENABLE_AVX512_SIMD16 -#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \ - template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\ - { return PaPatchListTerm_simd16<N>(pa, slot, verts); } +#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \ + template <> \ + bool PaPatchList_simd16<N, N>(PA_STATE_OPT & pa, uint32_t slot, simd16vector verts[]) \ + { \ + return PaPatchListTerm_simd16<N>(pa, slot, verts); \ + } PA_PATCH_LIST_TERMINATOR_SIMD16(1) PA_PATCH_LIST_TERMINATOR_SIMD16(2) PA_PATCH_LIST_TERMINATOR_SIMD16(3) @@ -595,13 +671,13 @@ PA_PATCH_LIST_TERMINATOR_SIMD16(32) bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaTriList1, PaTriListSingle0); - return false; // Not enough 
vertices to assemble 4 or 8 triangles. + return false; // Not enough vertices to assemble 4 or 8 triangles. } bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaTriList2, PaTriListSingle0); - return false; // Not enough vertices to assemble 8 triangles. + return false; // Not enough vertices to assemble 8 triangles. } bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -614,8 +690,8 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -626,8 +702,8 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -638,9 +714,9 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, 0, slot); - simdvector &b = PaGetSimdVector(pa, 1, slot); - simdvector &c = PaGetSimdVector(pa, 2, slot); + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + simdvector& c = PaGetSimdVector(pa, 2, slot); #endif simdscalar s; @@ -653,25 +729,25 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) for (int i = 0; i < 4; ++i) { simdvector& v0 = verts[0]; - v0[i] = _simd_blend_ps(a[i], b[i], 0x92); - v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); - v0[i] = _simd_permute_ps_i(v0[i], 0x6C); - s = _simd_permute2f128_ps(v0[i], v0[i], 0x21); - v0[i] = _simd_blend_ps(v0[i], s, 0x44); + v0[i] = _simd_blend_ps(a[i], b[i], 0x92); + v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); + v0[i] = _simd_permute_ps_i(v0[i], 0x6C); + s = _simd_permute2f128_ps(v0[i], v0[i], 0x21); + v0[i] = _simd_blend_ps(v0[i], s, 0x44); simdvector& v1 = verts[1]; - v1[i] = _simd_blend_ps(a[i], b[i], 0x24); - v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); - v1[i] = _simd_permute_ps_i(v1[i], 0xB1); - s = _simd_permute2f128_ps(v1[i], v1[i], 0x21); - v1[i] = _simd_blend_ps(v1[i], s, 0x66); + v1[i] = _simd_blend_ps(a[i], b[i], 0x24); + v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); + v1[i] = _simd_permute_ps_i(v1[i], 0xB1); + s = _simd_permute2f128_ps(v1[i], v1[i], 0x21); + v1[i] = _simd_blend_ps(v1[i], s, 0x66); simdvector& v2 = verts[2]; - v2[i] = _simd_blend_ps(a[i], b[i], 0x49); - v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); - v2[i] = _simd_permute_ps_i(v2[i], 0xC6); - s = _simd_permute2f128_ps(v2[i], v2[i], 0x21); - v2[i] = _simd_blend_ps(v2[i], s, 0x22); + v2[i] = _simd_blend_ps(a[i], b[i], 0x49); + v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); + v2[i] = _simd_permute_ps_i(v2[i], 0xC6); + s = _simd_permute2f128_ps(v2[i], v2[i], 0x21); + v2[i] = _simd_blend_ps(v2[i], s, 0x22); } #elif KNOB_ARCH >= KNOB_ARCH_AVX2 @@ -686,8 +762,8 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); + 
const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -698,8 +774,8 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -710,18 +786,18 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - const simdvector &a = PaGetSimdVector(pa, 0, slot); - const simdvector &b = PaGetSimdVector(pa, 1, slot); - const simdvector &c = PaGetSimdVector(pa, 2, slot); + const simdvector& a = PaGetSimdVector(pa, 0, slot); + const simdvector& b = PaGetSimdVector(pa, 1, slot); + const simdvector& c = PaGetSimdVector(pa, 2, slot); #endif // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 - simdvector &v0 = verts[0]; - simdvector &v1 = verts[1]; - simdvector &v2 = verts[2]; + simdvector& v0 = verts[0]; + simdvector& v1 = verts[1]; + simdvector& v2 = verts[2]; // for simd x, y, z, and w for (int i = 0; i < 4; ++i) @@ -744,30 +820,32 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0); - return false; // Not enough vertices to assemble 16 triangles + return false; // Not enough vertices to assemble 16 triangles } bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0); - return false; // Not enough vertices to assemble 16 triangles + return false; // Not enough vertices to assemble 16 triangles } bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { + // clang-format off + #if KNOB_ARCH >= KNOB_ARCH_AVX2 const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0); const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1); const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2); -#else // KNOB_ARCH == KNOB_ARCH_AVX +#else // KNOB_ARCH == KNOB_ARCH_AVX simd16scalar perm0 = _simd16_setzero_ps(); simd16scalar perm1 = _simd16_setzero_ps(); simd16scalar perm2 = _simd16_setzero_ps(); #endif - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot); const simd16mask mask0 = 0x4924; const simd16mask mask1 = 0x2492; @@ -777,16 +855,16 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; - simd16vector &v2 = verts[2]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; + simd16vector& v2 = verts[2]; // for simd16 x, y, z, and w for (int i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float 
*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); - simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float *>(&c[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); + simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i])); simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1); simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0); @@ -796,41 +874,43 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v0[i] = _simd16_permute_ps(temp0, perm0); v1[i] = _simd16_permute_ps(temp1, perm1); v2[i] = _simd16_permute_ps(temp2, perm2); -#else // #if KNOB_ARCH == KNOB_ARCH_AVX - +#else // #if KNOB_ARCH == KNOB_ARCH_AVX + // the general permutes (above) are prohibitively slow to emulate on AVX (its scalar code) - temp0 = _simd16_permute_ps_i(temp0, 0x6C); // (0, 3, 2, 1) => 00 11 01 10 => 0x6C - perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp0 = _simd16_blend_ps(temp0, perm0, 0x4444); // 0010 0010 0010 0010 - perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838); // 0001 1100 0001 1100 + temp0 = _simd16_permute_ps_i(temp0, 0x6C); // (0, 3, 2, 1) => 00 11 01 10 => 0x6C + perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 + temp0 = _simd16_blend_ps(temp0, perm0, 0x4444); // 0010 0010 0010 0010 + perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E + v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838); // 0001 1100 0001 1100 - temp1 = _simd16_permute_ps_i(temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp1 = _simd16_blend_ps(temp1, perm1, 0x6666); // 0010 0010 0010 0010 - perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818); // 0001 1000 0001 1000 + temp1 = _simd16_permute_ps_i(temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 + perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 + temp1 = _simd16_blend_ps(temp1, perm1, 0x6666); // 0010 0010 0010 0010 + perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E + v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818); // 0001 1000 0001 1000 - temp2 = _simd16_permute_ps_i(temp2, 0xC6); // (2, 1, 0, 3) => 01 10 00 11 => 0xC6 - perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp2 = _simd16_blend_ps(temp2, perm2, 0x2222); // 0100 0100 0100 0100 - perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C); // 0011 1000 0011 1000 + temp2 = _simd16_permute_ps_i(temp2, 0xC6); // (2, 1, 0, 3) => 01 10 00 11 => 0xC6 + perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 + temp2 = _simd16_blend_ps(temp2, perm2, 0x2222); // 0100 0100 0100 0100 + perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E + v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C); // 0011 1000 0011 1000 #endif } SetNextPaState_simd16(pa, 
PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; + + // clang-format on } #endif void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot); if (pa.useAlternateOffset) { @@ -929,9 +1009,9 @@ void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4 // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. - const simdvector &a = PaGetSimdVector(pa, 0, slot); - const simdvector &b = PaGetSimdVector(pa, 1, slot); - const simdvector &c = PaGetSimdVector(pa, 2, slot); + const simdvector& a = PaGetSimdVector(pa, 0, slot); + const simdvector& b = PaGetSimdVector(pa, 1, slot); + const simdvector& c = PaGetSimdVector(pa, 2, slot); // Convert from vertical to horizontal. // Tri Pattern - provoking vertex is always v0 @@ -988,7 +1068,7 @@ void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); - return false; // Not enough vertices to assemble 8 triangles. + return false; // Not enough vertices to assemble 8 triangles. } bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -999,7 +1079,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1009,7 +1089,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1019,13 +1099,13 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, pa.prev, slot); - simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); #endif simdscalar s; - for(int i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { simdscalar a0 = a[i]; simdscalar b0 = b[i]; @@ -1035,9 +1115,9 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) // v1 -> 13355779 // v2 -> 22446688 simdvector& v0 = verts[0]; - v0[i] = a0; + v0[i] = a0; - // s -> 4567891011 + // s -> 4567891011 s = _simd_permute2f128_ps(a0, b0, 0x21); // s -> 23456789 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); @@ -1055,17 +1135,19 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) return true; } -#if ENABLE_AVX512_SIMD16 +#if ENABLE_AVX512_SIMD16 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0); - return false; // Not enough vertices to assemble 16 triangles. + return false; // Not enough vertices to assemble 16 triangles. 
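// The shuffles in PaTriStrip1 above encode the classic strip pattern
// (v0 -> 01234567, v1 -> 13355779, v2 -> 22446688): even-numbered
// triangles take their vertices in order, odd-numbered triangles swap
// the last two to preserve winding. The same pattern in scalar index
// form (a sketch; the helper name is hypothetical):

#include <cstdint>

void TriStripIndices(uint32_t tri, uint32_t idx[3])
{
    idx[0] = tri;
    idx[1] = (tri & 1) ? tri + 2 : tri + 1; // odd tris swap v1/v2
    idx[2] = (tri & 1) ? tri + 1 : tri + 2;
}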
} bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot); + // clang-format off + + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); const simd16mask mask0 = 0xF000; @@ -1073,37 +1155,39 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; - simd16vector &v2 = verts[2]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; + simd16vector& v2 = verts[2]; // for simd16 x, y, z, and w for (int i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float *>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39);// (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3 - simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39);// (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 + simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3 + simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 - simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3 - simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 + simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3 + simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 - v0[i] = tempa; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 - v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 + v0[i] = tempa; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF + v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 + v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 } SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; + + // clang-format on } #endif void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); if (pa.useAlternateOffset) { @@ -1198,8 +1282,8 @@ void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, 
uint32_t primIndex, simd break; }; #else - const simdvector &a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); // Convert from vertical to horizontal. // Tri Pattern - provoking vertex is always v0 @@ -1256,7 +1340,7 @@ void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); - return false; // Not enough vertices to assemble 8 triangles. + return false; // Not enough vertices to assemble 8 triangles. } bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -1266,11 +1350,11 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) simdvector a; simdvector b; - const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot); + const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot); if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1282,7 +1366,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1294,15 +1378,15 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot); - const simdvector &a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot); + const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); #endif simdscalar s; // need to fill vectors 1/2 with new verts, and v0 with anchor vert. - for(int i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { simdscalar a0 = a[i]; simdscalar b0 = b[i]; @@ -1310,15 +1394,15 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) simdscalar comp = leadVert[i]; simdvector& v0 = verts[0]; - v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); - v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00); + v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); + v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00); simdvector& v2 = verts[2]; - s = _simd_permute2f128_ps(a0, b0, 0x21); - v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); + s = _simd_permute2f128_ps(a0, b0, 0x21); + v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); simdvector& v1 = verts[1]; - v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); + v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); } SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); @@ -1329,14 +1413,16 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0); - return false; // Not enough vertices to assemble 16 triangles. + return false; // Not enough vertices to assemble 16 triangles. 
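// PaTriFan1_simd16 below broadcasts the fan's lead vertex into every
// lane of v0 and slides v1/v2 along by one vertex per triangle
// (v1 -> b1 b2 ..., v2 -> b2 b3 ...). In scalar index form, relative to
// the start of the fan (a sketch; the helper name is hypothetical):

#include <cstdint>

void TriFanIndices(uint32_t tri, uint32_t idx[3])
{
    idx[0] = 0;       // anchor (lead) vertex, shared by every triangle
    idx[1] = tri + 1;
    idx[2] = tri + 2;
}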
} bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot); + // clang-format off + + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot); const simd16mask mask0 = 0xF000; @@ -1344,49 +1430,45 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; - simd16vector &v2 = verts[2]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; + simd16vector& v2 = verts[2]; // for simd16 x, y, z, and w for (uint32_t i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float *>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); - simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float *>(&c[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); + simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i])); - simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4 + simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4 - v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 + v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 - simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39);// (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 - simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39);// (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3 + simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 + simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3 - simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3 -#if 0 + simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3 - v2[i] = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - v1[i] = _simd16_shuffle_ps(tempb, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 -#else - - simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 + simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 - v2[i] = temp2; // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 -#endif + v1[i] = _simd16_shuffle_ps(tempb, temp2, 
_MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 + v2[i] = temp2; // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 } SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; + + // clang-format on } #endif void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot); if (pa.useAlternateOffset) { @@ -1420,9 +1502,9 @@ void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4s verts[2] = swizzleLaneN(c, primIndex - 14); } #else - const simdvector &a = PaGetSimdVector(pa, pa.first, slot); - const simdvector &b = PaGetSimdVector(pa, pa.prev, slot); - const simdvector &c = PaGetSimdVector(pa, pa.cur, slot); + const simdvector& a = PaGetSimdVector(pa, pa.first, slot); + const simdvector& b = PaGetSimdVector(pa, pa.prev, slot); + const simdvector& c = PaGetSimdVector(pa, pa.cur, slot); // vert 0 from leading vertex verts[0] = swizzleLane0(a); @@ -1452,7 +1534,7 @@ void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4s bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaQuadList1, PaQuadListSingle0); - return false; // Not enough vertices to assemble 8 triangles. + return false; // Not enough vertices to assemble 8 triangles. 
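The triangle-fan variants above follow the same scheme with a fixed anchor: v0 is the fan's lead vertex broadcast to every lane, and the other two vertices slide forward one vertex per triangle (the "b1 b2 ... c0" and "b2 b3 ... c1" lane comments). A scalar sketch, again with a hypothetical helper:

#include <array>
#include <cstdint>

// Sketch only: triangle n of a fan always shares vertex 0; the other two
// indices advance by one vertex per triangle.
static std::array<uint32_t, 3> TriFanIndices(uint32_t n)
{
    return {0u, n + 1, n + 2};
}

The lane-to-register mapping depends on the PA window: a (pa.first) holds the batch with the lead vertex, while b and c (pa.prev and pa.cur) supply the n+1 and n+2 vertices for the batch currently being assembled.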
} bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -1463,7 +1545,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1473,7 +1555,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1483,13 +1565,13 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, 0, slot); - simdvector &b = PaGetSimdVector(pa, 1, slot); + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); #endif simdscalar s1, s2; - for(int i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { simdscalar a0 = a[i]; simdscalar b0 = b[i]; @@ -1498,13 +1580,13 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) s2 = _mm256_permute2f128_ps(a0, b0, 0x31); simdvector& v0 = verts[0]; - v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); + v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); simdvector& v1 = verts[1]; - v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); + v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); simdvector& v2 = verts[2]; - v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); + v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); } SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); @@ -1515,46 +1597,50 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0); - return false; // Not enough vertices to assemble 16 triangles. + return false; // Not enough vertices to assemble 16 triangles. 
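PaQuadList1 above splits each quad into two triangles that share the quad's first vertex, which is also why the PA_STATE_OPT constructor later doubles numPrims for TOP_QUAD_LIST (and TOP_RECT_LIST). The index pattern behind the permute2f128/shuffle pairs, as a scalar sketch with a hypothetical helper:

#include <array>
#include <cstdint>

// Sketch only: triangle t of a quad list. Quads are 4 consecutive vertices;
// each quad emits { q, q+1, q+2 } followed by { q, q+2, q+3 }.
static std::array<uint32_t, 3> QuadListIndices(uint32_t t)
{
    uint32_t q = (t / 2) * 4; // first vertex of the owning quad
    return (t & 1) ? std::array<uint32_t, 3>{q, q + 2, q + 3}
                   : std::array<uint32_t, 3>{q, q + 1, q + 2};
}

For lanes 0 and 1 this gives { a0, a1, a2 } and { a0, a2, a3 }, matching the "v0 -> a0 a0 a4 a4 ..." comment rows.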
} bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); + // clang-format off + + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); // v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC // v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE // v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; - simd16vector &v2 = verts[2]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; + simd16vector& v2 = verts[2]; // for simd16 x, y, z, and w for (uint32_t i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float *>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88);// (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB - simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD);// (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF + simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB + simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF - v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC - v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE - v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF + v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC + v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE + v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF } SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; + + // clang-format on } #endif void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); if (pa.useAlternateOffset) { @@ -1661,8 +1747,8 @@ void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd break; } #else - const simdvector &a = PaGetSimdVector(pa, 0, slot); - const simdvector &b = PaGetSimdVector(pa, 1, slot); + const simdvector& a = PaGetSimdVector(pa, 0, slot); + const simdvector& b = PaGetSimdVector(pa, 1, slot); switch (primIndex) { @@ -1736,7 +1822,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) #if USE_SIMD16_FRONTEND simdvector first; - const simd16vector &first_16 = PaGetSimdVector_simd16(pa, pa.first, slot); + const 
simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot); if (!pa.useAlternateOffset) { @@ -1754,14 +1840,14 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &first = PaGetSimdVector(pa, pa.first, slot); + simdvector& first = PaGetSimdVector(pa, pa.first, slot); #endif for (int i = 0; i < 4; i++) { - float *firstVtx = (float *)&(first[i]); - float *targetVtx = (float *)&(verts[1][i]); - targetVtx[lane] = firstVtx[0]; + float* firstVtx = (float*)&(first[i]); + float* targetVtx = (float*)&(verts[1][i]); + targetVtx[lane] = firstVtx[0]; } } @@ -1785,17 +1871,18 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) // loop reconnect now const int lane = pa.numPrims - pa.numPrimsComplete - 1; - const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot); + const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot); for (int i = 0; i < 4; i++) { - float *firstVtx = (float *)&(first[i]); - float *targetVtx = (float *)&(verts[1][i]); - targetVtx[lane] = firstVtx[0]; + float* firstVtx = (float*)&(first[i]); + float* targetVtx = (float*)&(verts[1][i]); + targetVtx[lane] = firstVtx[0]; } } - SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); + SetNextPaState_simd16( + pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1807,11 +1894,11 @@ void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd if (pa.numPrimsComplete + primIndex == pa.numPrims - 1) { #if USE_SIMD16_FRONTEND - const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot); + const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot); verts[1] = swizzleLane0(first); #else - const simdvector &first = PaGetSimdVector(pa, pa.first, slot); + const simdvector& first = PaGetSimdVector(pa, pa.first, slot); verts[1] = swizzleLane0(first); #endif @@ -1821,7 +1908,7 @@ void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaLineList1, PaLineListSingle0); - return false; // Not enough vertices to assemble 8 lines + return false; // Not enough vertices to assemble 8 lines } bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -1832,7 +1919,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1842,7 +1929,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -1852,8 +1939,8 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, 0, slot); - simdvector &b = PaGetSimdVector(pa, 1, slot); + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); #endif /// @todo: verify provoking vertex is correct @@ -1885,43 +1972,47 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, 
PaLineListSingle0); - return false; // Not enough vertices to assemble 16 lines + return false; // Not enough vertices to assemble 16 lines } bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); + // clang-format off + + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; // for simd16 x, y, z, and w for (int i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float *>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88);// (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB - simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD);// (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF + simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB + simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF - v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE - v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF + v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE + v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF } SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; + + // clang-format on } #endif void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); if (pa.useAlternateOffset) { @@ -1996,8 +2087,8 @@ void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd break; } #else - const simdvector &a = PaGetSimdVector(pa, 0, slot); - const simdvector &b = PaGetSimdVector(pa, 1, slot); + const simdvector& a = PaGetSimdVector(pa, 0, slot); + const simdvector& b = PaGetSimdVector(pa, 1, slot); switch (primIndex) { @@ -2040,7 +2131,7 @@ void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); - return false; // Not enough vertices to assemble 8 lines + return false; // Not enough vertices to assemble 8 lines } bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) @@ -2051,7 +2142,7 @@ bool 
PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -2061,7 +2152,7 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -2071,8 +2162,8 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, pa.prev, slot); - simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); #endif /// @todo: verify provoking vertex is correct @@ -2085,7 +2176,7 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) verts[0] = a; - for(uint32_t i = 0; i < 4; ++i) + for (uint32_t i = 0; i < 4; ++i) { // 1 2 3 x 5 6 7 x __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1) @@ -2106,47 +2197,51 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0); - return false; // Not enough vertices to assemble 16 lines + return false; // Not enough vertices to assemble 16 lines } bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { + // clang-format off + const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); const simd16mask mask0 = 0x0001; // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; + simd16vector& v0 = verts[0]; + simd16vector& v1 = verts[1]; - v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF + v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF // for simd16 x, y, z, and w for (int i = 0; i < 4; i += 1) { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float *>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float *>(&b[i])); + simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); + simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF + simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 + v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 } SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; + + // clang-format on } #endif void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const 
simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); + const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); if (pa.useAlternateOffset) { @@ -2221,8 +2316,8 @@ void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, sim break; } #else - const simdvector &a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); switch (primIndex) { @@ -2267,7 +2362,7 @@ bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) #if USE_SIMD16_FRONTEND simdvector a; - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); if (!pa.useAlternateOffset) { @@ -2285,10 +2380,10 @@ bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #else - simdvector &a = PaGetSimdVector(pa, 0, slot); + simdvector& a = PaGetSimdVector(pa, 0, slot); #endif - verts[0] = a; // points only have 1 vertex. + verts[0] = a; // points only have 1 vertex. SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; @@ -2297,11 +2392,12 @@ bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) #if ENABLE_AVX512_SIMD16 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { - simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot); + simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot); - verts[0] = a; // points only have 1 vertex. + verts[0] = a; // points only have 1 vertex. - SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); + SetNextPaState_simd16( + pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -2309,7 +2405,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { #if USE_SIMD16_FRONTEND - const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); if (pa.useAlternateOffset) { @@ -2318,7 +2414,7 @@ void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4s verts[0] = swizzleLaneN(a, primIndex); #else - const simdvector &a = PaGetSimdVector(pa, 0, slot); + const simdvector& a = PaGetSimdVector(pa, 0, slot); verts[0] = swizzleLaneN(a, primIndex); #endif @@ -2332,7 +2428,7 @@ bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) SetNextPaState(pa, PaRectList1, PaRectListSingle0); return false; } - + ////////////////////////////////////////////////////////////////////////// /// @brief State 1 for RECT_LIST topology. /// Rect lists has the following format. @@ -2341,16 +2437,16 @@ bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) /// | \ | | \ | | \ | | \ | /// v1 o---o v4 o---o v7 o---o v10 o---o /// v0 v3 v6 v9 -/// +/// /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. -/// +/// /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 /// etc. 
-/// +/// /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 /// where v0 contains all the first vertices for 8 triangles. -/// +/// /// Result: /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } @@ -2358,20 +2454,18 @@ bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) /// /// @param pa - State for PA state machine. /// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -bool PaRectList1( - PA_STATE_OPT& pa, - uint32_t slot, - simdvector verts[]) +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, +/// etc. +bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { - // SIMD vectors a and b are the last two vertical outputs from the vertex shader. +// SIMD vectors a and b are the last two vertical outputs from the vertex shader. #if USE_SIMD16_FRONTEND simdvector a; simdvector b; if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -2381,54 +2475,60 @@ bool PaRectList1( } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1);; + b[i] = _simd16_extract_ps(b_16[i], 1); + ; } } #else - simdvector &a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } - simdvector &b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } + simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } + simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } #endif __m256 tmp0, tmp1, tmp2; // Loop over each component in the simdvector. - for(int i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { - simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + tmp0 = _mm256_permute2f128_ps( + b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } + v0[i] = _mm256_blend_ps( + a[i], + tmp0, + 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } + v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0[i] = + _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. /// AVX2 should make this much cheaper. 
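The NOTE above flags why this gather is expensive: verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } pulls arbitrary elements from two registers, and AVX's vpermilps/vperm2f128 cannot move a single element across a 128-bit lane boundary, hence the chain of permutes and blends. With AVX2's cross-lane vpermps the same gather collapses to two variable permutes plus one blend. A sketch of that idea (assumes AVX2; this is not the code the commit touches):

#include <immintrin.h>

// Sketch only: verts[1] gather for PaRectList1 using AVX2 cross-lane permutes.
// a = { v0..v7 }, b = { v8..v15 }.
static __m256 GatherRectV1_avx2(__m256 a, __m256 b)
{
    const __m256i idxA = _mm256_setr_epi32(1, 2, 4, 5, 7, 0, 0, 0); // v1 v2 v4 v5 v7 .  .  .
    const __m256i idxB = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 2, 3); // .  .  .  .  .  v8 v10 v11
    __m256 lo = _mm256_permutevar8x32_ps(a, idxA);
    __m256 hi = _mm256_permutevar8x32_ps(b, idxB);
    return _mm256_blend_ps(lo, hi, 0xE0); // lanes 5..7 from b's permute
}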
- simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } - v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } + tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } + tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * } + v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } // verts[2] = { v2, w, v5, x, v8, y, v11, z } - simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } - v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); + simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } + v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); // Need to compute 4th implied vertex for the rectangle. tmp2 = _mm256_sub_ps(v0[i], v1[i]); - tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } + tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } + v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } } SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); @@ -2440,11 +2540,9 @@ bool PaRectList1( /// Not implemented unless there is a use case for more then 8 rects. /// @param pa - State for PA state machine. /// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -bool PaRectList2( - PA_STATE_OPT& pa, - uint32_t slot, - simdvector verts[]) +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, +/// etc. 
+bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SWR_INVALID("Is rect list used for anything other then clears?"); SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); @@ -2469,16 +2567,16 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) /// | \ | | \ | | \ | | \ | /// v1 o---o v4 o---o v7 o---o v10 o---o /// v0 v3 v6 v9 -/// +/// /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. -/// +/// /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 /// etc. -/// +/// /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 /// where v0 contains all the first vertices for 8 triangles. -/// +/// /// Result: /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } @@ -2486,18 +2584,19 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) /// /// @param pa - State for PA state machine. /// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -bool PaRectList1_simd16( - PA_STATE_OPT& pa, - uint32_t slot, - simd16vector verts[]) +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, +/// etc. +bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { + // clang-format off + simdvector a; simdvector b; if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 } + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, + // v8, v9, v10, v11, v12, v13, v14, v15 } for (uint32_t i = 0; i < 4; i += 1) { @@ -2507,7 +2606,7 @@ bool PaRectList1_simd16( } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. } + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. } for (uint32_t i = 0; i < 4; i += 1) { @@ -2516,45 +2615,45 @@ bool PaRectList1_simd16( } } - simd16vector &v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - simd16vector &v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - simd16vector &v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } // Loop over each component in the simdvector. 
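Throughout these routines the outer "for (i = 0; i < 4)" loop walks the x, y, z, w components because vertices are kept in struct-of-arrays form: a simdvector is essentially four SIMD registers, one per component, each holding that component for a full batch of vertices. Conceptually (a sketch; the real simdvector/simd16vector types live in SWR's SIMD headers):

#include <immintrin.h>

// Sketch only, approximating SWR's SoA vertex layout.
struct SimdVectorSketch
{
    __m256 v[4]; // v[0]=x, v[1]=y, v[2]=z, v[3]=w, each for 8 vertices
    __m256&       operator[](int i)       { return v[i]; }
    const __m256& operator[](int i) const { return v[i]; }
};

A lane shuffle that reorders vertices therefore has to be applied four times, once per component register, which is what every "for simd16 x, y, z, and w" loop in this file does.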
for (int i = 0; i < 4; i += 1) { - simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } __m256 tmp0, tmp1, tmp2; - tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } + v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } + v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. /// AVX2 should make this much cheaper. - v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } - v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } + tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } + tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * } + v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } // verts[2] = { v2, w, v5, x, v8, y, v11, z } - v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } + v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0); // Need to compute 4th implied vertex for the rectangle. 
- tmp2 = _mm256_sub_ps(v0_lo, v1_lo); - tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + tmp2 = _mm256_sub_ps(v0_lo, v1_lo); + tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * } + tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } + v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0); v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0); @@ -2563,6 +2662,8 @@ bool PaRectList1_simd16( SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; + + // clang-format on } ////////////////////////////////////////////////////////////////////////// @@ -2570,14 +2671,13 @@ bool PaRectList1_simd16( /// Not implemented unless there is a use case for more then 8 rects. /// @param pa - State for PA state machine. /// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -bool PaRectList2_simd16( - PA_STATE_OPT& pa, - uint32_t slot, - simd16vector verts[]) +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, +/// etc. +bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { SWR_INVALID("Is rect list used for anything other then clears?"); - SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); + SetNextPaState_simd16( + pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -2591,23 +2691,20 @@ bool PaRectList2_simd16( /// @param pa - State for PA state machine. /// @param slot - Index into VS output for a given attribute. /// @param primIndex - Binner processes each triangle individually. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -void PaRectListSingle0( - PA_STATE_OPT& pa, - uint32_t slot, - uint32_t primIndex, - simd4scalar verts[]) +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, +/// etc. +void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { - // We have 12 simdscalars contained within 3 simdvectors which - // hold at least 8 triangles worth of data. We want to assemble a single - // triangle with data in horizontal form. +// We have 12 simdscalars contained within 3 simdvectors which +// hold at least 8 triangles worth of data. We want to assemble a single +// triangle with data in horizontal form. 
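The subtract/add pair above is the implied-vertex math from the doc comment: only three corners of each rectangle are supplied, and the fourth is reconstructed as w = v0 - v1 + v2, which is exact for a parallelogram. In scalar form (sketch only):

struct Vec4 { float x, y, z, w; };

// Sketch only: fourth corner of a rectangle from the three supplied corners,
// per the doc comment w = v0 - v1 + v2.
static Vec4 ImpliedRectVertex(const Vec4& v0, const Vec4& v1, const Vec4& v2)
{
    return { v0.x - v1.x + v2.x,
             v0.y - v1.y + v2.y,
             v0.z - v1.z + v2.z,
             v0.w - v1.w + v2.w };
}

The SIMD version computes this for all rects at once in the even lanes, then uses the 0xA0 permute and 0xAA blend to interleave the implied corners into the odd lanes of verts[2].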
#if USE_SIMD16_FRONTEND simdvector a; simdvector b; if (!pa.useAlternateOffset) { - const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); for (uint32_t i = 0; i < 4; i += 1) { @@ -2617,12 +2714,13 @@ void PaRectListSingle0( } else { - const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); for (uint32_t i = 0; i < 4; i += 1) { a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1);; + b[i] = _simd16_extract_ps(b_16[i], 1); + ; } } @@ -2631,7 +2729,7 @@ void PaRectListSingle0( #endif // Convert from vertical to horizontal. - switch(primIndex) + switch (primIndex) { case 0: verts[0] = swizzleLane0(a); @@ -2654,10 +2752,17 @@ void PaRectListSingle0( }; } -PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, - uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) : - PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), - cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) +PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT* in_pDC, + uint32_t in_numPrims, + uint8_t* pStream, + uint32_t in_streamSizeInVerts, + uint32_t in_vertexStride, + bool in_isStreaming, + uint32_t numVertsPerPrim, + PRIMITIVE_TOPOLOGY topo) : + PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), + numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0), + counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) { const API_STATE& state = GetApiState(pDC); @@ -2669,271 +2774,271 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* #endif switch (this->binTopology) { - case TOP_TRIANGLE_LIST: - this->pfnPaFunc = PaTriList0; + case TOP_TRIANGLE_LIST: + this->pfnPaFunc = PaTriList0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriList0_simd16; + this->pfnPaFunc_simd16 = PaTriList0_simd16; #endif - break; - case TOP_TRIANGLE_STRIP: - this->pfnPaFunc = PaTriStrip0; + break; + case TOP_TRIANGLE_STRIP: + this->pfnPaFunc = PaTriStrip0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriStrip0_simd16; + this->pfnPaFunc_simd16 = PaTriStrip0_simd16; #endif - break; - case TOP_TRIANGLE_FAN: - this->pfnPaFunc = PaTriFan0; + break; + case TOP_TRIANGLE_FAN: + this->pfnPaFunc = PaTriFan0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriFan0_simd16; + this->pfnPaFunc_simd16 = PaTriFan0_simd16; #endif - break; - case TOP_QUAD_LIST: - this->pfnPaFunc = PaQuadList0; + break; + case TOP_QUAD_LIST: + this->pfnPaFunc = PaQuadList0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaQuadList0_simd16; + this->pfnPaFunc_simd16 = PaQuadList0_simd16; #endif - this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles - break; - case TOP_QUAD_STRIP: - // quad strip pattern when decomposed into triangles is the same as verts strips - this->pfnPaFunc = PaTriStrip0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_QUAD_STRIP: + // quad strip pattern when decomposed into triangles is the same as verts strips + this->pfnPaFunc = PaTriStrip0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriStrip0_simd16; + this->pfnPaFunc_simd16 = 
PaTriStrip0_simd16; #endif - this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles - break; - case TOP_LINE_LIST: - this->pfnPaFunc = PaLineList0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_LINE_LIST: + this->pfnPaFunc = PaLineList0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineList0_simd16; + this->pfnPaFunc_simd16 = PaLineList0_simd16; #endif - this->numPrims = in_numPrims; - break; - case TOP_LINE_STRIP: - this->pfnPaFunc = PaLineStrip0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_STRIP: + this->pfnPaFunc = PaLineStrip0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineStrip0_simd16; + this->pfnPaFunc_simd16 = PaLineStrip0_simd16; #endif - this->numPrims = in_numPrims; - break; - case TOP_LINE_LOOP: - this->pfnPaFunc = PaLineLoop0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_LOOP: + this->pfnPaFunc = PaLineLoop0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineLoop0_simd16; + this->pfnPaFunc_simd16 = PaLineLoop0_simd16; #endif - this->numPrims = in_numPrims; - break; - case TOP_POINT_LIST: - this->pfnPaFunc = PaPoints0; + this->numPrims = in_numPrims; + break; + case TOP_POINT_LIST: + this->pfnPaFunc = PaPoints0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPoints0_simd16; + this->pfnPaFunc_simd16 = PaPoints0_simd16; #endif - this->numPrims = in_numPrims; - break; - case TOP_RECT_LIST: - this->pfnPaFunc = PaRectList0; + this->numPrims = in_numPrims; + break; + case TOP_RECT_LIST: + this->pfnPaFunc = PaRectList0; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaRectList0_simd16; + this->pfnPaFunc_simd16 = PaRectList0_simd16; #endif - this->numPrims = in_numPrims * 2; - break; + this->numPrims = in_numPrims * 2; + break; - case TOP_PATCHLIST_1: - this->pfnPaFunc = PaPatchList<1>; + case TOP_PATCHLIST_1: + this->pfnPaFunc = PaPatchList<1>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<1>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<1>; #endif - break; - case TOP_PATCHLIST_2: - this->pfnPaFunc = PaPatchList<2>; + break; + case TOP_PATCHLIST_2: + this->pfnPaFunc = PaPatchList<2>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<2>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<2>; #endif - break; - case TOP_PATCHLIST_3: - this->pfnPaFunc = PaPatchList<3>; + break; + case TOP_PATCHLIST_3: + this->pfnPaFunc = PaPatchList<3>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<3>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<3>; #endif - break; - case TOP_PATCHLIST_4: - this->pfnPaFunc = PaPatchList<4>; + break; + case TOP_PATCHLIST_4: + this->pfnPaFunc = PaPatchList<4>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<4>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<4>; #endif - break; - case TOP_PATCHLIST_5: - this->pfnPaFunc = PaPatchList<5>; + break; + case TOP_PATCHLIST_5: + this->pfnPaFunc = PaPatchList<5>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<5>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<5>; #endif - break; - case TOP_PATCHLIST_6: - this->pfnPaFunc = PaPatchList<6>; + break; + case TOP_PATCHLIST_6: + this->pfnPaFunc = PaPatchList<6>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<6>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<6>; #endif - break; - case TOP_PATCHLIST_7: - this->pfnPaFunc = PaPatchList<7>; + break; + case TOP_PATCHLIST_7: + this->pfnPaFunc = PaPatchList<7>; #if 
ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<7>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<7>; #endif - break; - case TOP_PATCHLIST_8: - this->pfnPaFunc = PaPatchList<8>; + break; + case TOP_PATCHLIST_8: + this->pfnPaFunc = PaPatchList<8>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<8>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<8>; #endif - break; - case TOP_PATCHLIST_9: - this->pfnPaFunc = PaPatchList<9>; + break; + case TOP_PATCHLIST_9: + this->pfnPaFunc = PaPatchList<9>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<9>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<9>; #endif - break; - case TOP_PATCHLIST_10: - this->pfnPaFunc = PaPatchList<10>; + break; + case TOP_PATCHLIST_10: + this->pfnPaFunc = PaPatchList<10>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<10>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<10>; #endif - break; - case TOP_PATCHLIST_11: - this->pfnPaFunc = PaPatchList<11>; + break; + case TOP_PATCHLIST_11: + this->pfnPaFunc = PaPatchList<11>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<11>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<11>; #endif - break; - case TOP_PATCHLIST_12: - this->pfnPaFunc = PaPatchList<12>; + break; + case TOP_PATCHLIST_12: + this->pfnPaFunc = PaPatchList<12>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<12>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<12>; #endif - break; - case TOP_PATCHLIST_13: - this->pfnPaFunc = PaPatchList<13>; + break; + case TOP_PATCHLIST_13: + this->pfnPaFunc = PaPatchList<13>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<13>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<13>; #endif - break; - case TOP_PATCHLIST_14: - this->pfnPaFunc = PaPatchList<14>; + break; + case TOP_PATCHLIST_14: + this->pfnPaFunc = PaPatchList<14>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<14>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<14>; #endif - break; - case TOP_PATCHLIST_15: - this->pfnPaFunc = PaPatchList<15>; + break; + case TOP_PATCHLIST_15: + this->pfnPaFunc = PaPatchList<15>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<15>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<15>; #endif - break; - case TOP_PATCHLIST_16: - this->pfnPaFunc = PaPatchList<16>; + break; + case TOP_PATCHLIST_16: + this->pfnPaFunc = PaPatchList<16>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<16>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<16>; #endif - break; - case TOP_PATCHLIST_17: - this->pfnPaFunc = PaPatchList<17>; + break; + case TOP_PATCHLIST_17: + this->pfnPaFunc = PaPatchList<17>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<17>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<17>; #endif - break; - case TOP_PATCHLIST_18: - this->pfnPaFunc = PaPatchList<18>; + break; + case TOP_PATCHLIST_18: + this->pfnPaFunc = PaPatchList<18>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<18>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<18>; #endif - break; - case TOP_PATCHLIST_19: - this->pfnPaFunc = PaPatchList<19>; + break; + case TOP_PATCHLIST_19: + this->pfnPaFunc = PaPatchList<19>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<19>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<19>; #endif - break; - case TOP_PATCHLIST_20: - this->pfnPaFunc = PaPatchList<20>; + break; + case TOP_PATCHLIST_20: + this->pfnPaFunc = 
PaPatchList<20>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<20>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<20>; #endif - break; - case TOP_PATCHLIST_21: - this->pfnPaFunc = PaPatchList<21>; + break; + case TOP_PATCHLIST_21: + this->pfnPaFunc = PaPatchList<21>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<21>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<21>; #endif - break; - case TOP_PATCHLIST_22: - this->pfnPaFunc = PaPatchList<22>; + break; + case TOP_PATCHLIST_22: + this->pfnPaFunc = PaPatchList<22>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<22>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<22>; #endif - break; - case TOP_PATCHLIST_23: - this->pfnPaFunc = PaPatchList<23>; + break; + case TOP_PATCHLIST_23: + this->pfnPaFunc = PaPatchList<23>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<23>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<23>; #endif - break; - case TOP_PATCHLIST_24: - this->pfnPaFunc = PaPatchList<24>; + break; + case TOP_PATCHLIST_24: + this->pfnPaFunc = PaPatchList<24>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<24>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<24>; #endif - break; - case TOP_PATCHLIST_25: - this->pfnPaFunc = PaPatchList<25>; + break; + case TOP_PATCHLIST_25: + this->pfnPaFunc = PaPatchList<25>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<25>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<25>; #endif - break; - case TOP_PATCHLIST_26: - this->pfnPaFunc = PaPatchList<26>; + break; + case TOP_PATCHLIST_26: + this->pfnPaFunc = PaPatchList<26>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<26>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<26>; #endif - break; - case TOP_PATCHLIST_27: - this->pfnPaFunc = PaPatchList<27>; + break; + case TOP_PATCHLIST_27: + this->pfnPaFunc = PaPatchList<27>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<27>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<27>; #endif - break; - case TOP_PATCHLIST_28: - this->pfnPaFunc = PaPatchList<28>; + break; + case TOP_PATCHLIST_28: + this->pfnPaFunc = PaPatchList<28>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<28>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<28>; #endif - break; - case TOP_PATCHLIST_29: - this->pfnPaFunc = PaPatchList<29>; + break; + case TOP_PATCHLIST_29: + this->pfnPaFunc = PaPatchList<29>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<29>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<29>; #endif - break; - case TOP_PATCHLIST_30: - this->pfnPaFunc = PaPatchList<30>; + break; + case TOP_PATCHLIST_30: + this->pfnPaFunc = PaPatchList<30>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<30>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<30>; #endif - break; - case TOP_PATCHLIST_31: - this->pfnPaFunc = PaPatchList<31>; + break; + case TOP_PATCHLIST_31: + this->pfnPaFunc = PaPatchList<31>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<31>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<31>; #endif - break; - case TOP_PATCHLIST_32: - this->pfnPaFunc = PaPatchList<32>; + break; + case TOP_PATCHLIST_32: + this->pfnPaFunc = PaPatchList<32>; #if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<32>; + this->pfnPaFunc_simd16 = PaPatchList_simd16<32>; #endif - break; + break; - default: - SWR_INVALID("Invalid topology: %d", this->binTopology); - 
break; + default: + SWR_INVALID("Invalid topology: %d", this->binTopology); + break; }; this->pfnPaFuncReset = this->pfnPaFunc; @@ -2943,95 +3048,94 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* #if USE_SIMD16_FRONTEND simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - simd16scalari id82 = _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); #else simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); #endif - switch(this->binTopology) + switch (this->binTopology) { - case TOP_TRIANGLE_LIST: - case TOP_TRIANGLE_STRIP: - case TOP_TRIANGLE_FAN: - case TOP_LINE_STRIP: - case TOP_LINE_LIST: - case TOP_LINE_LOOP: + case TOP_TRIANGLE_LIST: + case TOP_TRIANGLE_STRIP: + case TOP_TRIANGLE_FAN: + case TOP_LINE_STRIP: + case TOP_LINE_LIST: + case TOP_LINE_LOOP: #if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; + this->primIDIncr = 16; + this->primID = id16; #else - this->primIDIncr = 8; - this->primID = id8; + this->primIDIncr = 8; + this->primID = id8; #endif - break; - case TOP_QUAD_LIST: - case TOP_QUAD_STRIP: - case TOP_RECT_LIST: + break; + case TOP_QUAD_LIST: + case TOP_QUAD_STRIP: + case TOP_RECT_LIST: #if USE_SIMD16_FRONTEND - this->primIDIncr = 8; - this->primID = id82; + this->primIDIncr = 8; + this->primID = id82; #else - this->primIDIncr = 4; - this->primID = id4; + this->primIDIncr = 4; + this->primID = id4; #endif - break; - case TOP_POINT_LIST: + break; + case TOP_POINT_LIST: #if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; + this->primIDIncr = 16; + this->primID = id16; #else - this->primIDIncr = 8; - this->primID = id8; -#endif - break; - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case TOP_PATCHLIST_32: - // Always run KNOB_SIMD_WIDTH number of patches at a time. 
+ this->primIDIncr = 8; + this->primID = id8; +#endif + break; + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + // Always run KNOB_SIMD_WIDTH number of patches at a time. #if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; + this->primIDIncr = 16; + this->primID = id16; #else - this->primIDIncr = 8; - this->primID = id8; + this->primIDIncr = 8; + this->primID = id8; #endif - break; + break; - default: - SWR_INVALID("Invalid topology: %d", this->binTopology); - break; + default: + SWR_INVALID("Invalid topology: %d", this->binTopology); + break; }; - } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 67c28ad97c4..a392035700d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rasterizer.cpp -* -* @brief Implementation for the rasterizer. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rasterizer.cpp + * + * @brief Implementation for the rasterizer. + * + ******************************************************************************/ #include <vector> #include <algorithm> @@ -39,11 +39,12 @@ #include "memory/tilingtraits.h" #include "rasterizer_impl.h" -PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2]; +PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT] + [STATE_VALID_TRI_EDGE_COUNT][2]; -void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) { - const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); + const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData); #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_BIN_TRIS) { @@ -54,23 +55,24 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi // bloat line to two tris and call the triangle rasterizer twice RDTSC_BEGIN(BERasterizeLine, pDC->drawId); - const API_STATE &state = GetApiState(pDC); - const SWR_RASTSTATE &rastState = state.rastState; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; // macrotile dimensioning uint32_t macroX, macroY; MacroTileMgr::getTileIndices(macroTile, macroX, macroY); - int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; - int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; - int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; + const SWR_RECT& scissorInFixedPoint = + state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; // create a copy of the triangle buffer to write our adjusted vertices to OSALIGNSIMD(float) newTriBuffer[4 * 4]; TRIANGLE_WORK_DESC newWorkDesc = workDesc; - newWorkDesc.pTriBuffer = &newTriBuffer[0]; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; // create a copy of the attrib buffer to write our adjusted attribs to 
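
For context on the routine being reflowed here: RasterizeLine turns a line segment into two screen-space triangles by pushing the endpoints out by half the line width along the minor axis, with triFlags.yMajor selecting which axis receives the offset. The real code does this with SSE shuffles and per-corner bloat signs (vBloat0/vBloat1); the following is only a simplified scalar sketch of the geometry, using hypothetical types:

#include <array>

struct Vec2
{
    float x, y;
};

// Sketch only: bloat line p0->p1 of width w into two triangles.
// yMajor mirrors triFlags.yMajor: a y-major line is widened along x.
std::array<std::array<Vec2, 3>, 2> bloatLine(Vec2 p0, Vec2 p1, float w, bool yMajor)
{
    const float h = 0.5f * w;
    const Vec2 o = yMajor ? Vec2{h, 0.0f} : Vec2{0.0f, h}; // minor-axis offset
    const Vec2 a0{p0.x - o.x, p0.y - o.y}, a1{p0.x + o.x, p0.y + o.y};
    const Vec2 b0{p1.x - o.x, p1.y - o.y}, b1{p1.x + o.x, p1.y + o.y};
    // two triangles covering the widened quad
    return {{ {{a0, a1, b1}}, {{b1, b0, a0}} }};
}

Each bloated triangle is then bounding-box tested against the macrotile and scissor rect before the triangle rasterizer is invoked, which is the trivial-reject test visible in the hunks that follow.
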
OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; @@ -81,20 +83,20 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi __m128 vX, vY, vZ, vRecipW; - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); // triangle 0 // v0,v1 -> v0,v0,v1 - __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); - __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); + __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); if (workDesc.triFlags.yMajor) { vXa = _mm_add_ps(vAdjust, vXa); @@ -123,7 +125,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi } // Store user clip distances for triangle 0 - float newClipBuffer[3 * 8]; + float newClipBuffer[3 * 8]; uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); if (numClipDist) { @@ -151,8 +153,12 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, - SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, + rastState.bIsCenterPattern, + false, + SWR_INPUT_COVERAGE_NONE, + EdgeValToEdgeState(ALL_EDGES_VALID), + (pDC->pState->state.scissorsTileAligned == false)); // make sure this macrotile intersects the triangle __m128i vXai = fpToFixedPoint(vXa); @@ -160,23 +166,20 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi OSALIGNSIMD(SWR_RECT) bboxA; calcBoundingBoxInt(vXai, vYai, bboxA); - if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || - bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { + if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax || + bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin || + bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax || + bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin)) + { // rasterize triangle pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } // triangle 1 // v0,v1 -> v1,v1,v0 - vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); - vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); - vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); + vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); + vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); + vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); vRecipWa = 
_mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); vAdjust = _mm_mul_ps(vLineWidth, vBloat1); @@ -233,14 +236,11 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi vYai = fpToFixedPoint(vYa); calcBoundingBoxInt(vXai, vYai, bboxA); - if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || - bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { + if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax || + bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin || + bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax || + bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin)) + { // rasterize triangle pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } @@ -248,7 +248,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi RDTSC_BEGIN(BERasterizeLine, 1); } -void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) { #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_BIN_TRIS) @@ -257,21 +257,19 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi } #endif - const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - // map x,y relative offsets from start of raster tile to bit position in + // map x,y relative offsets from start of raster tile to bit position in // coverage mask for the point - static const uint32_t coverageMap[8][8] = { - { 0, 1, 4, 5, 8, 9, 12, 13 }, - { 2, 3, 6, 7, 10, 11, 14, 15 }, - { 16, 17, 20, 21, 24, 25, 28, 29 }, - { 18, 19, 22, 23, 26, 27, 30, 31 }, - { 32, 33, 36, 37, 40, 41, 44, 45 }, - { 34, 35, 38, 39, 42, 43, 46, 47 }, - { 48, 49, 52, 53, 56, 57, 60, 61 }, - { 50, 51, 54, 55, 58, 59, 62, 63 } - }; + static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13}, + {2, 3, 6, 7, 10, 11, 14, 15}, + {16, 17, 20, 21, 24, 25, 28, 29}, + {18, 19, 22, 23, 26, 27, 30, 31}, + {32, 33, 36, 37, 40, 41, 44, 45}, + {34, 35, 38, 39, 42, 43, 46, 47}, + {48, 49, 52, 53, 56, 57, 60, 61}, + {50, 51, 54, 55, 58, 59, 62, 63}}; OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; @@ -279,7 +277,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi // @todo use structs for readability uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); - float z = *(workDesc.pTriBuffer + 2); + float z = *(workDesc.pTriBuffer + 2); // construct triangle descriptor for point // no interpolation, set up i,j for constant interpolation of z and attribs @@ -294,27 +292,32 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi // no persp divide needed for points triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; - triDesc.triFlags = workDesc.triFlags; - triDesc.recipDet = 1.0f; + triDesc.triFlags = workDesc.triFlags; + triDesc.recipDet = 1.0f; triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; triDesc.I[0] = triDesc.I[1] = 
triDesc.I[2] = 0.0f; triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; RenderOutputBuffers renderBuffers; - GetRenderHotTiles(pDC, workerId, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, - renderBuffers, triDesc.triFlags.renderTargetArrayIndex); + GetRenderHotTiles(pDC, + workerId, + macroTile, + tileAlignedX >> KNOB_TILE_X_DIM_SHIFT, + tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, + renderBuffers, + triDesc.triFlags.renderTargetArrayIndex); RDTSC_BEGIN(BEPixelBackend, pDC->drawId); backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); RDTSC_END(BEPixelBackend, 0); } -void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) { - const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; - const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0; @@ -326,28 +329,28 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, // create a copy of the triangle buffer to write our adjusted vertices to OSALIGNSIMD(float) newTriBuffer[4 * 4]; TRIANGLE_WORK_DESC newWorkDesc = workDesc; - newWorkDesc.pTriBuffer = &newTriBuffer[0]; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; // create a copy of the attrib buffer to write our adjusted attribs to OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; newWorkDesc.pAttribs = &newAttribBuffer[0]; newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer; - newWorkDesc.numAttribs = workDesc.numAttribs; - newWorkDesc.triFlags = workDesc.triFlags; + newWorkDesc.numAttribs = workDesc.numAttribs; + newWorkDesc.triFlags = workDesc.triFlags; // construct two tris by bloating point by point size float halfPointSize = workDesc.triFlags.pointSize * 0.5f; - float lowerX = x - halfPointSize; - float upperX = x + halfPointSize; - float lowerY = y - halfPointSize; - float upperY = y + halfPointSize; + float lowerX = x - halfPointSize; + float upperX = x + halfPointSize; + float lowerY = y - halfPointSize; + float upperY = y + halfPointSize; // tri 0 - float *pBuf = &newTriBuffer[0]; - *pBuf++ = lowerX; - *pBuf++ = lowerX; - *pBuf++ = upperX; + float* pBuf = &newTriBuffer[0]; + *pBuf++ = lowerX; + *pBuf++ = lowerX; + *pBuf++ = upperX; pBuf++; *pBuf++ = lowerY; *pBuf++ = upperY; @@ -359,8 +362,12 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, - SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, + rastState.bIsCenterPattern, + false, + SWR_INPUT_COVERAGE_NONE, + EdgeValToEdgeState(ALL_EDGES_VALID), + (pDC->pState->state.scissorsTileAligned == false)); // overwrite texcoords for point sprites if (isPointSpriteTexCoordEnabled) @@ -370,8 +377,8 @@ void 
RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, newWorkDesc.pAttribs = &newAttribBuffer[0]; // overwrite texcoord for point sprites - uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; - DWORD texCoordAttrib = 0; + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; while (_BitScanForward(&texCoordAttrib, texCoordMask)) { @@ -400,7 +407,7 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); // tri 1 - pBuf = &newTriBuffer[0]; + pBuf = &newTriBuffer[0]; *pBuf++ = lowerX; *pBuf++ = upperX; *pBuf++ = upperX; @@ -412,8 +419,8 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, if (isPointSpriteTexCoordEnabled) { - uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; - DWORD texCoordAttrib = 0; + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; while (_BitScanForward(&texCoordAttrib, texCoordMask)) { @@ -424,7 +431,6 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1); pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); - } else { @@ -444,20 +450,19 @@ void InitRasterizerFunctions() } // Selector for correct templated RasterizeTriangle function -PFN_WORK_FUNC GetRasterizerFunc( - SWR_MULTISAMPLE_COUNT numSamples, - bool IsCenter, - bool IsConservative, - SWR_INPUT_COVERAGE InputCoverage, - uint32_t EdgeEnable, - bool RasterizeScissorEdges -) +PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples, + bool IsCenter, + bool IsConservative, + SWR_INPUT_COVERAGE InputCoverage, + uint32_t EdgeEnable, + bool RasterizeScissorEdges) { SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT); SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT); SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT); - PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges]; + PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage] + [EdgeEnable][RasterizeScissorEdges]; SWR_ASSERT(func); return func; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h index 414d0f07819..f15cc193129 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rasterizer.h -* -* @brief Definitions for the rasterizer. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rasterizer.h + * + * @brief Definitions for the rasterizer. 
+ * + ******************************************************************************/ #pragma once #include "context.h" @@ -32,9 +32,9 @@ #include "conservativeRast.h" #include "multisample.h" -void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); +void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); +void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); void InitRasterizerFunctions(); INLINE @@ -56,43 +56,43 @@ enum TriEdgesStates enum TriEdgesValues { - NO_VALID_EDGES = 0, - E0_E1_VALID = 0x3, - E0_E2_VALID = 0x5, - E1_E2_VALID = 0x6, + NO_VALID_EDGES = 0, + E0_E1_VALID = 0x3, + E0_E2_VALID = 0x5, + E1_E2_VALID = 0x6, ALL_EDGES_VALID = 0x7, VALID_TRI_EDGE_COUNT, }; // Selector for correct templated RasterizeTriangle function -PFN_WORK_FUNC GetRasterizerFunc( - SWR_MULTISAMPLE_COUNT numSamples, - bool IsCenter, - bool IsConservative, - SWR_INPUT_COVERAGE InputCoverage, - uint32_t EdgeEnable, - bool RasterizeScissorEdges); +PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples, + bool IsCenter, + bool IsConservative, + SWR_INPUT_COVERAGE InputCoverage, + uint32_t EdgeEnable, + bool RasterizeScissorEdges); ////////////////////////////////////////////////////////////////////////// -/// @brief ValidTriEdges convenience typedefs used for templated function +/// @brief ValidTriEdges convenience typedefs used for templated function /// specialization supported Fixed Point precisions typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT; -typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT; -typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT; -typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT; -typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT; +typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT; +typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT; +typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT; +typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT; typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT; -typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT; -typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT; -typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT; -typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT; +typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT; +typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT; +typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT; +typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT; // some specializations to convert from edge state to edge bitmask values template <typename EdgeMask> struct EdgeMaskVal { - static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID, "Primary EdgeMaskVal shouldn't be instantiated"); + static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID, + "Primary EdgeMaskVal shouldn't be instantiated"); }; template <> @@ -128,15 +128,15 @@ struct 
EdgeMaskVal<StateNoEdgesValidT> INLINE uint32_t EdgeValToEdgeState(uint32_t val) { SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask"); - static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = { 0, 0, 0, 1, 0, 2, 3, 4 }; - return edgeValToEdgeState[val]; + static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = {0, 0, 0, 1, 0, 2, 3, 4}; + return edgeValToEdgeState[val]; } ////////////////////////////////////////////////////////////////////////// /// @struct RasterScissorEdgesT -/// @brief Primary RasterScissorEdgesT templated struct that holds compile -/// time information about the number of edges needed to be rasterized, -/// If either the scissor rect or conservative rast is enabled, +/// @brief Primary RasterScissorEdgesT templated struct that holds compile +/// time information about the number of edges needed to be rasterized, +/// If either the scissor rect or conservative rast is enabled, /// the scissor test is enabled and the rasterizer will test /// 3 triangle edges + 4 scissor edges for coverage. /// @tparam RasterScissorEdgesT: number of multisamples @@ -145,20 +145,20 @@ INLINE uint32_t EdgeValToEdgeState(uint32_t val) template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT> struct RasterEdgeTraits { - typedef std::true_type RasterizeScissorEdgesT; + typedef std::true_type RasterizeScissorEdgesT; typedef std::integral_constant<uint32_t, 7> NumEdgesT; - //typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT; + // typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT; typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT; }; ////////////////////////////////////////////////////////////////////////// /// @brief specialization of RasterEdgeTraits. If neither scissor rect -/// nor conservative rast is enabled, only test 3 triangle edges +/// nor conservative rast is enabled, only test 3 triangle edges /// for coverage template <typename EdgeMaskT> struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT> { - typedef std::false_type RasterizeScissorEdgesT; + typedef std::false_type RasterizeScissorEdgesT; typedef std::integral_constant<uint32_t, 3> NumEdgesT; // no need for degenerate edge masking in non-conservative case; rasterize all triangle edges typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT; @@ -166,45 +166,72 @@ struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT> ////////////////////////////////////////////////////////////////////////// /// @struct RasterizerTraits -/// @brief templated struct that holds compile time information used +/// @brief templated struct that holds compile time information used /// during rasterization. Inherits EdgeTraits and ConservativeRastBETraits. /// @tparam NumSamplesT: number of multisamples /// @tparam ConservativeT: is this a conservative rasterization /// @tparam InputCoverageT: what type of input coverage is the PS expecting? /// (only used with conservative rasterization) /// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor? 
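
The point of the traits plumbing below (and of the gRasterizerFuncs table earlier in this diff) is to pre-instantiate one rasterizer per combination of compile-time options, then pick among them with runtime flags. A toy reduction of the pattern to two booleans, with hypothetical names:

#include <cstdio>

using WorkFunc = void (*)();

// One template instantiation per compile-time combination.
template <bool ConservativeT, bool ScissorT>
void RasterizeImpl()
{
    std::printf("conservative=%d scissor=%d\n", ConservativeT, ScissorT);
}

// One table entry per combination, indexed by the runtime flags.
static WorkFunc gFuncs[2][2] = {
    {RasterizeImpl<false, false>, RasterizeImpl<false, true>},
    {RasterizeImpl<true, false>, RasterizeImpl<true, true>},
};

WorkFunc GetFunc(bool conservative, bool scissor)
{
    return gFuncs[conservative][scissor]; // bools index 0/1, like the real table
}

The benefit is that every branch on sample count, conservative mode, coverage type, edge mask, and scissor handling is resolved at compile time inside the hot loop, at the cost of the large multidimensional function table seen above.
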
-template <typename NumSamplesT, typename CenterPatternT, typename ConservativeT, typename InputCoverageT, typename EdgeEnableT, typename RasterScissorEdgesT> +template <typename NumSamplesT, + typename CenterPatternT, + typename ConservativeT, + typename InputCoverageT, + typename EdgeEnableT, + typename RasterScissorEdgesT> struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>, - public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT> + public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT> { - typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), CenterPatternT::value> MT; + typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), + CenterPatternT::value> + MT; /// Fixed point precision the rasterizer is using typedef FixedPointTraits<Fixed_16_8> PrecisionT; /// Fixed point precision of the edge tests used during rasterization typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT; - // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage test, with the result copied to all samples - typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples> NumCoverageSamplesT; + // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage + // test, with the result copied to all samples + typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples> + NumCoverageSamplesT; - static_assert(EdgePrecisionT::BitsT::value >= ConservativeRastBETraits<ConservativeT, InputCoverageT>::ConservativePrecisionT::BitsT::value, - "Rasterizer edge fixed point precision < required conservative rast precision"); + static_assert( + EdgePrecisionT::BitsT::value >= + ConservativeRastBETraits<ConservativeT, + InputCoverageT>::ConservativePrecisionT::BitsT::value, + "Rasterizer edge fixed point precision < required conservative rast precision"); /// constants used to offset between different types of raster tiles - static const int colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples}; - static const int depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples}; - static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples}; - static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; - static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; - static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; + static const int colorRasterTileStep{ + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * + MT::numSamples}; + static const int depthRasterTileStep{ + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * + MT::numSamples}; + static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * + (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * + MT::numSamples}; + static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * + colorRasterTileStep}; + static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * + depthRasterTileStep}; + static const int 
stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * + stencilRasterTileStep}; }; -template <uint32_t NumSamplesT, uint32_t CenterPatternT, uint32_t ConservativeT, uint32_t InputCoverageT, uint32_t EdgeEnableT, uint32_t RasterScissorEdgesT> -struct RasterizerTraits final : public _RasterizerTraits < - std::integral_constant<uint32_t, NumSamplesT>, - std::integral_constant<bool, CenterPatternT != 0>, - std::integral_constant<bool, ConservativeT != 0>, - std::integral_constant<uint32_t, InputCoverageT>, - std::integral_constant<uint32_t, EdgeEnableT>, - std::integral_constant<bool, RasterScissorEdgesT != 0> > -{}; +template <uint32_t NumSamplesT, + uint32_t CenterPatternT, + uint32_t ConservativeT, + uint32_t InputCoverageT, + uint32_t EdgeEnableT, + uint32_t RasterScissorEdgesT> +struct RasterizerTraits final + : public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>, + std::integral_constant<bool, CenterPatternT != 0>, + std::integral_constant<bool, ConservativeT != 0>, + std::integral_constant<uint32_t, InputCoverageT>, + std::integral_constant<uint32_t, EdgeEnableT>, + std::integral_constant<bool, RasterScissorEdgesT != 0>> +{ +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h index ca39d7c38f8..20206eaaaf5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file rasterizer.cpp -* -* @brief Implementation for the rasterizer. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file rasterizer.cpp + * + * @brief Implementation for the rasterizer. + * + ******************************************************************************/ #include <vector> #include <algorithm> @@ -37,18 +37,29 @@ #include "tilemgr.h" #include "memory/tilingtraits.h" -extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2]; +extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT] + [STATE_VALID_TRI_EDGE_COUNT][2]; template <uint32_t numSamples = 1> -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); +void GetRenderHotTiles(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroID, + uint32_t x, + uint32_t y, + RenderOutputBuffers& renderBuffers, + uint32_t renderTargetArrayIndex); template <typename RT> -void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers); +void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers); template <typename RT> -void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); +void StepRasterTileY(uint32_t colorHotTileMask, + RenderOutputBuffers& buffers, + RenderOutputBuffers& startBufferRow); -#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -static const __m256d gMaskToVecpd[] = -{ +#define MASKTOVEC(i3, i2, i1, i0) \ + { \ + -i0, -i1, -i2, -i3 \ + } +static const __m256d gMaskToVecpd[] = { MASKTOVEC(0, 0, 0, 0), MASKTOVEC(0, 0, 0, 1), MASKTOVEC(0, 0, 1, 0), @@ -74,11 +85,11 @@ struct POS struct EDGE { - double a, b; // a, b edge coefficients in fix8 - double stepQuadX; // step to adjacent horizontal quad in fix16 - double stepQuadY; // step to adjacent vertical quad in fix16 - double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 - double stepRasterTileY; // step to adjacent vertical raster tile in fix16 + double a, b; // a, b edge coefficients in fix8 + double stepQuadX; // step to adjacent horizontal quad in fix16 + double stepQuadY; // step to adjacent vertical quad in fix16 + double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 + double stepRasterTileY; // step to adjacent vertical raster tile in fix16 __m256d vQuadOffsets; // offsets for 4 samples of a quad __m256d 
vRasterTileOffsets; // offsets for the 4 corners of a raster tile @@ -86,12 +97,15 @@ struct EDGE ////////////////////////////////////////////////////////////////////////// /// @brief rasterize a raster tile partially covered by the triangle -/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile +/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster +/// tile /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. /// Used to step between quads when sweeping over the raster tile. -template<uint32_t NumEdges, typename EdgeMaskT> -INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) +template <uint32_t NumEdges, typename EdgeMaskT> +INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC, + double startEdges[NumEdges], + EDGE* pRastEdges) { uint64_t coverageMask = 0; @@ -111,50 +125,49 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg // fast unrolled version for 8x8 tile #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 - int edgeMask[NumEdges]; + int edgeMask[NumEdges]; uint64_t mask; - auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; - auto update_lambda = [&](int e){mask &= edgeMask[e];}; - auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; - auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; - auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; + auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); }; + auto update_lambda = [&](int e) { mask &= edgeMask[e]; }; + auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); }; + auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); }; + auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); }; // evaluate which pixels in the quad are covered -#define EVAL \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); +#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); // update coverage mask // if edge 0 is degenerate and will be skipped; init the mask -#define UPDATE_MASK(bit) \ - if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\ - mask = 0xf;\ - }\ - else{\ - mask = edgeMask[0]; \ - }\ - UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ - coverageMask |= (mask << bit); - - // step in the +x direction to the next quad -#define INCX \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); - - // step in the +y direction to the next quad -#define INCY \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); - - // step in the -x direction to the next quad -#define DECX \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); - - // sweep 2x2 quad back and forth through the raster tile, +#define UPDATE_MASK(bit) \ + if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \ + std::is_same<EdgeMaskT, NoEdgesValidT>::value) \ + { \ + mask = 0xf; \ + } \ + else \ + { \ + mask = edgeMask[0]; \ + } \ + UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ + coverageMask |= (mask << bit); + + // step in the +x direction to the next quad +#define INCX UnrollerLMask<0, NumEdges, 1, 
EdgeMaskT::value>::step(incx_lambda); + + // step in the +y direction to the next quad +#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); + + // step in the -x direction to the next quad +#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); + + // sweep 2x2 quad back and forth through the raster tile, // computing coverage masks for the entire tile // raster tile - // 0 1 2 3 4 5 6 7 + // 0 1 2 3 4 5 6 7 // x x - // x x ------------------> + // x x ------------------> // x x | // <-----------------x x V // .. @@ -173,7 +186,7 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg UPDATE_MASK(12); INCY; - //row 1 + // row 1 EVAL; UPDATE_MASK(28); DECX; @@ -215,7 +228,7 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg UPDATE_MASK(48); #else uint32_t bit = 0; - for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) + for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y) { __m256d vStartOfRowEdge[NumEdges]; for (uint32_t e = 0; e < NumEdges; ++e) @@ -223,7 +236,7 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg vStartOfRowEdge[e] = vEdges[e]; } - for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) + for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x) { int edgeMask[NumEdges]; for (uint32_t e = 0; e < NumEdges; ++e) @@ -243,7 +256,7 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); } - bit+=4; + bit += 4; } // step to the next row @@ -254,20 +267,19 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg } #endif return coverageMask; - } // Top left rule: // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge -// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge -// Top left: a sample is in if it is a top or left edge. -// Out: !(horizontal && above) = !horizontal && below -// Out: !horizontal && left = !(!horizontal && left) = horizontal and right -INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) +// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it +// is a 'left' edge Top left: a sample is in if it is a top or left edge. 
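
Restated in scalar form, the adjustment this comment describes (and which adjustTopLeftRuleIntFix16 implements with vector compares and blends) is: a sample lying exactly on an edge counts only when that edge is a top or left edge; otherwise the edge value is nudged just outside so the sample fails the test. A sketch using the same sign tests as the vector code, CCW winding assumed:

// e is the edge equation A*x + B*y + C evaluated at the sample, fixed point.
double applyTopLeftRule(int a, int b, double e)
{
    // a < 0: edge is not horizontal and is not a left edge.
    // a == 0 && b < 0: edge is horizontal and is not a top edge.
    const bool nudgeOut = (a < 0) || (a == 0 && b < 0);
    return (e == 0.0 && nudgeOut) ? e - 1.0 : e; // bump on-edge samples outside
}

This guarantees a pixel shared by two adjacent triangles is rasterized exactly once, since only one of the shared edges can qualify as top or left.
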
Out: !(horizontal && +// above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and +// right +INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge) { // if vA < 0, vC-- // if vA == 0 && vB < 0, vC-- - __m256d vEdgeOut = vEdge; + __m256d vEdgeOut = vEdge; __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); // if vA < 0 (line is not horizontal and below) @@ -275,7 +287,7 @@ INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); - int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); + int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); // if either of these are true and we're on the line (edge == 0), bump it outside the line @@ -285,17 +297,19 @@ INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256 ////////////////////////////////////////////////////////////////////////// /// @brief calculates difference in precision between the result of manh /// calculation and the edge precision, based on compile time trait values -template<typename RT> +template <typename RT> constexpr int64_t ManhToEdgePrecisionAdjust() { - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= + RT::EdgePrecisionT::BitsT::value, "Inadequate precision of result of manh calculation "); - return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); + return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - + RT::EdgePrecisionT::BitsT::value); } ////////////////////////////////////////////////////////////////////////// /// @struct adjustEdgeConservative -/// @brief Primary template definition used for partially specializing +/// @brief Primary template definition used for partially specializing /// the adjustEdgeConservative function. This struct should never /// be instantiated. /// @tparam RT: rasterizer traits @@ -306,38 +320,42 @@ struct adjustEdgeConservative ////////////////////////////////////////////////////////////////////////// /// @brief Performs calculations to adjust each edge of a triangle away /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y - /// direction. + /// direction. /// /// Uncertainty regions arise from fixed point rounding, which /// can snap a vertex +/- by min fixed point value. /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. - /// This allows the rasterizer to test for coverage only at the pixel center, + /// This allows the rasterizer to test for coverage only at the pixel center, /// instead of having to test individual pixel corners for conservative coverage - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) { - // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away - // from the pixel center (in the direction of the edge normal A/B) + // Assumes CCW winding order. 
Subtracting from the evaluated edge equation moves the edge + // away from the pixel center (in the direction of the edge normal A/B) // edge = Ax + Bx + C - (manh/e) // manh = manhattan distance = abs(A) + abs(B) // e = absolute rounding error from snapping from float to fixed point precision - // 'fixed point' multiply (in double to be avx1 friendly) + // 'fixed point' multiply (in double to be avx1 friendly) // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example - __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); - __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), - _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); - - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), + vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); + __m256d manh = + _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), + _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); + + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= + RT::EdgePrecisionT::BitsT::value, "Inadequate precision of result of manh calculation "); - // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision - // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right + // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the + // same precision since we're doing fixed math in double format, multiply by multiples of + // 1/2 instead of a bit shift right manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); - // move the edge away from the pixel center by the required conservative precision + 1/2 pixel - // this allows the rasterizer to do a single conservative coverage test to see if the primitive - // intersects the pixel at all + // move the edge away from the pixel center by the required conservative precision + 1/2 + // pixel this allows the rasterizer to do a single conservative coverage test to see if the + // primitive intersects the pixel at all vEdge = _mm256_sub_pd(vEdge, manh); }; }; @@ -347,43 +365,51 @@ struct adjustEdgeConservative template <typename RT> struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> { - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; + INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){}; }; ////////////////////////////////////////////////////////////////////////// -/// @brief calculates the distance a degenerate BBox needs to be adjusted +/// @brief calculates the distance a degenerate BBox needs to be adjusted /// for conservative rast based on compile time trait values -template<typename RT> +template <typename RT> constexpr int64_t ConservativeScissorOffset() { - static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision"); - // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges - typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 
0 : 1> DegenerateEdgeOffsetT; + static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, + "Rasterizer precision > conservative precision"); + // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox + // when calculating scissor edges + typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> + DegenerateEdgeOffsetT; // 1/2 pixel edge offset + conservative offset - degenerateTriangle - return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); + return RT::ConservativeEdgeOffsetT::value - + (DegenerateEdgeOffsetT::value + << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); } ////////////////////////////////////////////////////////////////////////// /// @brief Performs calculations to adjust each a vector of evaluated edges out /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. +/// direction. template <typename RT> -INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) +INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge) { int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); - int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>(); + int64_t manh = + ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> + ManhToEdgePrecisionAdjust<RT>(); vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); }; ////////////////////////////////////////////////////////////////////////// /// @brief Performs calculations to adjust each a scalar evaluated edge out /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. +/// direction. 
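
In equation form, what adjustScalarEdge (and its vector counterpart above) computes is E'(x, y) = E(x, y) - (|A| + |B|) * d, where E(x, y) = A*x + B*y + C is the edge equation and d is the half-pixel extent plus the fixed-point uncertainty. Since |A|*dx + |B|*dy <= (|A| + |B|) * d for any offset with |dx|, |dy| <= d, the subtracted term is a Manhattan-distance bound on how much E can change anywhere inside that box, so testing E' at the pixel center alone conservatively reports whether any point of the pixel could pass the edge test. The shift by ManhToEdgePrecisionAdjust simply brings the fixed-point product back down to the rasterizer's x.16 edge precision.
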
template <typename RT, typename OffsetT> INLINE double adjustScalarEdge(const double a, const double b, const double Edge) { int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); - int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); + int64_t manh = + ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); return (Edge - manh); }; @@ -392,12 +418,14 @@ INLINE double adjustScalarEdge(const double a, const double b, const double Edge template <typename RT, typename EdgeOffsetT> struct adjustEdgesFix16 { - INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) { - static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, - "Edge equation expected to be in x.16 fixed point"); + static_assert( + std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, + "Edge equation expected to be in x.16 fixed point"); - static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); + static_assert(RT::IsConservativeT::value, + "Edge offset assumes conservative rasterization is enabled"); // need to apply any edge offsets before applying the top-left rule adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); @@ -411,7 +439,7 @@ struct adjustEdgesFix16 template <typename RT> struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>> { - INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) { adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); } @@ -449,7 +477,8 @@ INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) return std::max(dzdx, dzdy); } -INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) +INLINE float +ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) { if (pState->depthFormat == R24_UNORM_X8_TYPELESS) { @@ -464,7 +493,7 @@ INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_D SWR_ASSERT(pState->depthFormat == R32_FLOAT); // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) - float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); + float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); uint32_t zMaxInt = *(uint32_t*)&zMax; zMaxInt &= 0x7f800000; zMax = *(float*)&zMaxInt; @@ -473,7 +502,8 @@ INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_D } } -INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) +INLINE float +ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) { if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) { @@ -512,7 +542,8 @@ __declspec(thread) volatile uint64_t gToss; static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; // try to avoid _chkstk insertions; make this thread local -static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; +static THREAD +OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; INLINE void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) @@ -534,11 +565,13 @@ void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) __m256d 
vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); - edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); + edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); // compute raster tile offsets - const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); - const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); + const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd( + (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0); + const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd( + (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0); __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); @@ -552,30 +585,33 @@ void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) } ////////////////////////////////////////////////////////////////////////// -/// @brief Primary template definition used for partially specializing -/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel +/// @brief Primary template definition used for partially specializing +/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel /// corner to sample position, and test for coverage /// @tparam sampleCount: multisample count template <typename NumSamplesT> -INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, - int32_t &mask0, int32_t &mask1, int32_t &mask2) +INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], + const __m256d* vEdgeFix16, + int32_t& mask0, + int32_t& mask1, + int32_t& mask2) { __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; // evaluate edge equations at the tile multisample bounding box vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); - mask0 = _mm256_movemask_pd(vSampleBboxTest0); - mask1 = _mm256_movemask_pd(vSampleBboxTest1); - mask2 = _mm256_movemask_pd(vSampleBboxTest2); + mask0 = _mm256_movemask_pd(vSampleBboxTest0); + mask1 = _mm256_movemask_pd(vSampleBboxTest1); + mask2 = _mm256_movemask_pd(vSampleBboxTest2); } ////////////////////////////////////////////////////////////////////////// /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated /// when only rasterizing a single coverage test point template <> -INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16, - int32_t &mask0, int32_t &mask1, int32_t &mask2) +INLINE void UpdateEdgeMasks<SingleSampleT>( + const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2) { mask0 = _mm256_movemask_pd(vEdgeFix16[0]); mask1 = _mm256_movemask_pd(vEdgeFix16[1]); @@ -585,7 +621,7 @@ INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* v ////////////////////////////////////////////////////////////////////////// /// @struct ComputeScissorEdges /// @brief Primary template definition. Allows the function to be generically -/// called. When paired with below specializations, will result in an empty +/// called. 
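Editorial aside: the quad- and tile-offset products in the hunk above exploit the fact that an edge equation is affine, E(x + dx, y + dy) = E(x, y) + a*dx + b*dy, so per-quad and per-raster-tile deltas can be precomputed once at setup and reused as single adds. A scalar sketch, with illustrative tile dimensions and the 16.8 fixed-point scale this file static_asserts elsewhere:

#include <cstdint>

constexpr int32_t kFixedPointScale = 256;       // 16.8 fixed point
constexpr int32_t kTileDimX = 8, kTileDimY = 8; // illustrative KNOB_TILE_*_DIM values

struct EdgeSketch { double a, b; };

// Evaluated edge value after moving by (dx, dy): one add once the delta is
// cached, which is what vQuadOffsets / the tile offsets store per edge.
inline double StepEdge(const EdgeSketch& e, double edgeAtOrigin, int32_t dx, int32_t dy)
{
    return edgeAtOrigin + e.a * static_cast<double>(dx) + e.b * static_cast<double>(dy);
}

// Delta from a tile's UL corner to its LR corner, mirroring vTileOffsets above.
inline double TileCornerDelta(const EdgeSketch& e)
{
    return StepEdge(e, 0.0, (kTileDimX - 1) * kFixedPointScale,
                    (kTileDimY - 1) * kFixedPointScale);
}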
When paired with below specializations, will result in an empty /// inlined function if scissor is not enabled /// @tparam RasterScissorEdgesT: is scissor enabled? /// @tparam IsConservativeT: is conservative rast enabled? @@ -593,21 +629,29 @@ INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* v template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT> struct ComputeScissorEdges { - INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; + INLINE ComputeScissorEdges(const SWR_RECT& triBBox, + const SWR_RECT& scissorBBox, + const int32_t x, + const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], + __m256d (&vEdgeFix16)[7]){}; }; ////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial +/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial /// specialization. Instantiated when conservative rast and scissor are enabled template <typename RT> struct ComputeScissorEdges<std::true_type, std::true_type, RT> { ////////////////////////////////////////////////////////////////////////// - /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, + /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, /// evaluate edge equations and offset them away from pixel center. - INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + INLINE ComputeScissorEdges(const SWR_RECT& triBBox, + const SWR_RECT& scissorBBox, + const int32_t x, + const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], + __m256d (&vEdgeFix16)[7]) { // if conservative rasterizing, triangle bbox intersected with scissor bbox is used SWR_RECT scissor; @@ -627,12 +671,17 @@ struct ComputeScissorEdges<std::true_type, std::true_type, RT> ComputeEdgeData(bottomRight, topRight, rastEdges[5]); ComputeEdgeData(topRight, topLeft, rastEdges[6]); - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); - - // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + + (rastEdges[3].b * (y - scissor.ymin))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + + (rastEdges[4].b * (y - scissor.ymax))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + + (rastEdges[5].b * (y - scissor.ymax))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + + (rastEdges[6].b * (y - scissor.ymin))); + + // if conservative rasterizing, need to bump the scissor edges out by the conservative + // uncertainty distance, else do nothing adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, 
vEdgeFix16[5]); @@ -645,7 +694,7 @@ struct ComputeScissorEdges<std::true_type, std::true_type, RT> }; ////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial +/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial /// specialization. Instantiated when scissor is enabled and conservative rast /// is disabled. template <typename RT> @@ -653,14 +702,18 @@ struct ComputeScissorEdges<std::true_type, std::false_type, RT> { ////////////////////////////////////////////////////////////////////////// /// @brief Compute scissor edge vectors and evaluate edge equations - INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + INLINE ComputeScissorEdges(const SWR_RECT&, + const SWR_RECT& scissorBBox, + const int32_t x, + const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], + __m256d (&vEdgeFix16)[7]) { - const SWR_RECT &scissor = scissorBBox; - POS topLeft{scissor.xmin, scissor.ymin}; - POS bottomLeft{scissor.xmin, scissor.ymax}; - POS topRight{scissor.xmax, scissor.ymin}; - POS bottomRight{scissor.xmax, scissor.ymax}; + const SWR_RECT& scissor = scissorBBox; + POS topLeft{scissor.xmin, scissor.ymin}; + POS bottomLeft{scissor.xmin, scissor.ymax}; + POS topRight{scissor.xmax, scissor.ymin}; + POS bottomRight{scissor.xmax, scissor.ymax}; // construct 4 scissor edges in ccw direction ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); @@ -668,10 +721,14 @@ struct ComputeScissorEdges<std::true_type, std::false_type, RT> ComputeEdgeData(bottomRight, topRight, rastEdges[5]); ComputeEdgeData(topRight, topLeft, rastEdges[6]); - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + + (rastEdges[3].b * (y - scissor.ymin))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + + (rastEdges[4].b * (y - scissor.ymax))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + + (rastEdges[5].b * (y - scissor.ymax))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + + (rastEdges[6].b * (y - scissor.ymin))); // Upper left rule for scissor vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); @@ -723,7 +780,7 @@ INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int template <> INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2) { - return (!(mask0 && mask1 && mask2)) ? true : false;; + return !(mask0 && mask1 && mask2); }; ////////////////////////////////////////////////////////////////////////// @@ -737,7 +795,7 @@ INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int) ////////////////////////////////////////////////////////////////////////// /// @brief Primary function template for TrivialAcceptTest.
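Editorial aside: the reject predicate above (tidied here to drop a redundant ternary and stray semicolon) reads as follows: each mask carries one sign bit per raster-tile corner for one edge, so if any edge sees no corners on its covered side, the tile cannot intersect the triangle. Distilled, with the bit convention assumed for illustration:

// maskN: low 4 bits, one per raster-tile corner, set when that corner lies on
// the covered side of edge N (assumed convention; the real code derives the
// bits from _mm256_movemask_pd on the evaluated edge equations).
inline bool TrivialRejectSketch(int mask0, int mask1, int mask2)
{
    return !(mask0 && mask1 && mask2); // any all-outside edge rejects the tile
}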
Always returns -/// false, since it will only be called for degenerate tris, and as such +/// false, since it will only be called for degenerate tris, and as such /// will never cover the entire raster tile template <typename ScissorEnableT> INLINE bool TrivialAcceptTest(const int, const int, const int) @@ -760,27 +818,33 @@ INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> struct GenerateSVInnerCoverage { - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){}; + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){}; }; ////////////////////////////////////////////////////////////////////////// /// @brief Specialization of GenerateSVInnerCoverage where all edges -/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated +/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated /// edge values from OuterConservative to InnerConservative and rasterizes. template <typename RT> struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> { - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, + uint32_t workerId, + EDGE* pRastEdges, + double* pStartQuadEdges, + uint64_t& innerCoverageMask) { double startQuadEdgesAdj[RT::NumEdgesT::value]; - for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { - startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); + startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>( + pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); } // not trivial accept or reject, must rasterize full tile RDTSC_BEGIN(BERasterizePartial, pDC->drawId); - innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges); + innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( + pDC, startQuadEdgesAdj, pRastEdges); RDTSC_END(BERasterizePartial, 0); } }; @@ -791,43 +855,62 @@ struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> struct UpdateEdgeMasksInnerConservative { - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, - const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], + const __m256d*, + const __m128i, + const __m128i, + int32_t&, + int32_t&, + int32_t&){}; }; ////////////////////////////////////////////////////////////////////////// /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges -/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges -/// evaluated at raster tile corners to inner conservative position and +/// are non-degenerate and SVInnerCoverage is requested. 
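Editorial aside: the SV inner-coverage path above is a two-pass pattern: tighten each already-evaluated edge by the inner-conservative offset, then run the partial-tile rasterizer again on the tightened set. A self-contained sketch of that flow, where the offset constants and the rasterizer callback are placeholders for adjustScalarEdge and rasterizePartialTile:

#include <cstdint>
#include <cstdlib>
#include <functional>

constexpr uint32_t kNumEdges = 3; // triangle edges only, for the sketch

// Tighten outer-conservative edge values, then re-rasterize to get the
// inner coverage mask; 'rasterize' stands in for rasterizePartialTile<...>.
inline uint64_t InnerCoverageSketch(
    const double (&a)[kNumEdges],
    const double (&b)[kNumEdges],
    const double (&outerEdges)[kNumEdges],
    int64_t innerOffset, // assumed InnerConservativeEdgeOffsetT-style constant
    int64_t precShift,   // assumed ManhToEdgePrecisionAdjust-style shift
    const std::function<uint64_t(const double (&)[kNumEdges])>& rasterize)
{
    double inner[kNumEdges];
    for (uint32_t e = 0; e < kNumEdges; ++e)
    {
        const int64_t aabs = std::abs(static_cast<int64_t>(a[e]));
        const int64_t babs = std::abs(static_cast<int64_t>(b[e]));
        inner[e] = outerEdges[e] -
                   static_cast<double>(((aabs + babs) * innerOffset) >> precShift);
    }
    return rasterize(inner); // second pass yields the inner coverage mask
}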
Offsets the edges +/// evaluated at raster tile corners to inner conservative position and /// updates edge masks template <typename RT> struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> { - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, - const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], + const __m256d* vEdgeFix16, + const __m128i vAi, + const __m128i vBi, + int32_t& mask0, + int32_t& mask1, + int32_t& mask2) { __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; - // instead of keeping 2 copies of evaluated edges around, just compensate for the outer + // instead of keeping 2 copies of evaluated edges around, just compensate for the outer // conservative evaluated edge when adjusting the edge in for inner conservative tests - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]); - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]); - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]); - - UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( + vAi, vBi, vTempEdge[0]); + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( + vAi, vBi, vTempEdge[1]); + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( + vAi, vBi, vTempEdge[2]); + + UpdateEdgeMasks<typename RT::NumCoverageSamplesT>( + vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); } }; ////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage -/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot +/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage +/// is requested but at least one edge is degenerate. 
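Editorial aside: a recurring idiom in this file, visible in both halves of the hunk above, is an empty primary template plus working specializations, giving compile-time dispatch that inlines to nothing when a feature (here SV inner coverage) is off. A distilled sketch of the idiom:

#include <type_traits>

// Primary template: constructor body is empty, so instantiations for
// non-matching configurations compile away entirely.
template <typename InnerCoverageT>
struct UpdateMasksSketch
{
    explicit UpdateMasksSketch(int& /*mask0*/) {}
};

// Specialization for the enabled case; the degenerate-edge variant in the
// real code similarly just zeroes mask0 to force the partial-tile path.
template <>
struct UpdateMasksSketch<std::true_type>
{
    explicit UpdateMasksSketch(int& mask0) { mask0 = 0; }
};

// Usage: a constructor call acts as the statically selected function:
//   int mask0 = 0xf;
//   UpdateMasksSketch<std::true_type>{mask0}; // mask0 is now 0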
Since a degenerate triangle cannot /// cover an entire raster tile, set mask0 to 0 to force it down the /// rastierizePartialTile path template <typename RT, typename ValidEdgeMaskT> struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT> { - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*, - const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &) + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], + const __m256d*, + const __m128i, + const __m128i, + int32_t& mask0, + int32_t&, + int32_t&) { // set one mask to zero to force the triangle down the rastierizePartialTile path mask0 = 0; @@ -837,7 +920,7 @@ struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCov template <typename RT> void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) { - const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); + const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc); #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_BIN_TRIS) { @@ -847,24 +930,25 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId); RDTSC_BEGIN(BETriangleSetup, pDC->drawId); - const API_STATE &state = GetApiState(pDC); - const SWR_RASTSTATE &rastState = state.rastState; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; __m128 vX, vY, vZ, vRecipW; - + // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care // eg: vX = [x0 x1 x2 dc] - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); // convert to fixed point - static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision"); + static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, + "Rasterizer expects 16.8 fixed point precision"); __m128i vXi = fpToFixedPoint(vX); __m128i vYi = fpToFixedPoint(vY); @@ -879,12 +963,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m128i vAi, vBi; triangleSetupABInt(vXi, vYi, vAi, vBi); - + // determinant float det = calcDeterminantInt(vAi, vBi); // Verts in Pixel Coordinate Space at this point - // Det > 0 = CW winding order + // Det > 0 = CW winding order // Convert CW triangles to CCW if (det > 0.0) { @@ -899,9 +983,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Finish triangle setup - C edge coef triangleSetupC(vX, vY, vA, vB, vC); - if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) + if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) { - // If we have degenerate edge(s) to rasterize, set I and J coefs + // If we have degenerate edge(s) to rasterize, set I and J coefs // to 0 for constant interpolation of attributes triDesc.I[0] = 0.0f; triDesc.I[1] = 0.0f; @@ -915,7 +999,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else { - // only extract coefs for 2 of the barycentrics; the 3rd can be + // only extract 
coefs for 2 of the barycentrics; the 3rd can be // determined from the barycentric equation: // i + j + k = 1 <=> k = 1 - j - i _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); @@ -926,7 +1010,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); // compute recipDet, used to calculate barycentric i and j in the backend - triDesc.recipDet = 1.0f/det; + triDesc.recipDet = 1.0f / det; } OSALIGNSIMD(float) oneOverW[4]; @@ -935,31 +1019,31 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; triDesc.OneOverW[2] = oneOverW[2]; - // calculate perspective correct coefs per vertex attrib - float* pPerspAttribs = perspAttribsTLS; - float* pAttribs = workDesc.pAttribs; + // calculate perspective correct coefs per vertex attrib + float* pPerspAttribs = perspAttribsTLS; + float* pAttribs = workDesc.pAttribs; triDesc.pPerspAttribs = pPerspAttribs; - triDesc.pAttribs = pAttribs; - float *pRecipW = workDesc.pTriBuffer + 12; - triDesc.pRecipW = pRecipW; - __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); - __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); - __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); - for(uint32_t i = 0; i < workDesc.numAttribs; i++) + triDesc.pAttribs = pAttribs; + float* pRecipW = workDesc.pTriBuffer + 12; + triDesc.pRecipW = pRecipW; + __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); + __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1); + __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1); + for (uint32_t i = 0; i < workDesc.numAttribs; i++) { __m128 attribA = _mm_load_ps(pAttribs); - __m128 attribB = _mm_load_ps(pAttribs+=4); - __m128 attribC = _mm_load_ps(pAttribs+=4); - pAttribs+=4; + __m128 attribB = _mm_load_ps(pAttribs += 4); + __m128 attribC = _mm_load_ps(pAttribs += 4); + pAttribs += 4; attribA = _mm_mul_ps(attribA, vOneOverWV0); attribB = _mm_mul_ps(attribB, vOneOverWV1); attribC = _mm_mul_ps(attribC, vOneOverWV2); _mm_store_ps(pPerspAttribs, attribA); - _mm_store_ps(pPerspAttribs+=4, attribB); - _mm_store_ps(pPerspAttribs+=4, attribC); - pPerspAttribs+=4; + _mm_store_ps(pPerspAttribs += 4, attribB); + _mm_store_ps(pPerspAttribs += 4, attribC); + pPerspAttribs += 4; } // compute bary Z @@ -969,7 +1053,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.Z[0] = a[0] - a[2]; triDesc.Z[1] = a[1] - a[2]; triDesc.Z[2] = a[2]; - + // add depth bias triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); @@ -977,12 +1061,17 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, OSALIGNSIMD(SWR_RECT) bbox; calcBoundingBoxInt(vXi, vYi, bbox); - const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; + const SWR_RECT& scissorInFixedPoint = + state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) + if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) { - // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid - bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++; + // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is + // valid + bbox.xmin--; + bbox.xmax++; + bbox.ymin--; + bbox.ymax++; SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, "Conservative rast degenerate handling requires a valid scissor rect"); } @@ -996,12 +1085,13 @@ void 
RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.triFlags = workDesc.triFlags; - // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox + // further constrain backend to intersecting bounding box of macro tile and scissored triangle + // bbox uint32_t macroX, macroY; MacroTileMgr::getTileIndices(macroTile, macroX, macroY); - int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; - int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; - int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; intersect.xmin = std::max(intersect.xmin, macroBoxLeft); @@ -1009,19 +1099,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, intersect.xmax = std::min(intersect.xmax, macroBoxRight); intersect.ymax = std::min(intersect.ymax, macroBoxBottom); - SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0); + SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && + intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && + intersect.ymax >= 0); RDTSC_END(BETriangleSetup, 0); // update triangle desc - uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); uint32_t numTilesX = maxTileX - minTileX + 1; uint32_t numTilesY = maxTileY - minTileY + 1; - if (numTilesX == 0 || numTilesY == 0) + if (numTilesX == 0 || numTilesY == 0) { RDTSC_EVENT(BEEmptyTriangle, 1, 0); RDTSC_END(BERasterizeTriangle, 1); @@ -1040,7 +1132,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // single sample rasterization evaluates edges at pixel center, // multisample evaluates edges at the UL pixel corner and steps to each sample position - if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value) + if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) { // Add 0.5, in fixed point, to offset to pixel center x += (FIXED_POINT_SCALE / 2); @@ -1051,7 +1143,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m128i vTopLeftY = _mm_set1_epi32(y); // evaluate edge equations at top-left pixel using 64bit math - // + // // line = Ax + By + C // solving for C: // C = -Ax - By // at a vertex (x0, y0) on the line: C = -Ax0 - By0 // line = Ax + By - Ax0 - By0 // line = A(x - x0) + B(y - y0) // dX = (x-x0), dY = (y-y0) - // so all this simplifies to + // so all this simplifies to // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing
within __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); // evaluate A(dx) and B(dY) for all points - __m256d vAipd = _mm256_cvtepi32_pd(vAi); - __m256d vBipd = _mm256_cvtepi32_pd(vBi); + __m256d vAipd = _mm256_cvtepi32_pd(vAi); + __m256d vBipd = _mm256_cvtepi32_pd(vBi); __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); - __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); + __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); // apply any edge adjustments (top-left, crast, etc.) adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge); @@ -1098,8 +1190,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); // Compute and store triangle edge data if scissor needs to be rasterized - ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT> - (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); + ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>( + bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used for testing if entire raster tile is inside a triangle @@ -1117,9 +1209,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m256d vEdgeTileBbox[3]; if (NumCoverageSamplesT::value > 1) { - const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; - const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); - const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); + const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; + const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); + const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); @@ -1128,24 +1220,33 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // used for testing if entire raster tile is inside a triangle for (uint32_t e = 0; e < 3; ++e) { - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); + __m256d vResultAxFix16 = + _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = + _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); // adjust for msaa tile bbox edges outward for conservative rast, if enabled - adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]); + adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>( + vAi, vBi, vEdgeTileBbox[e]); } } RDTSC_END(BEStepSetup, 0); - uint32_t tY = minTileY; - uint32_t tX = minTileX; + uint32_t tY = minTileY; + uint32_t tX = minTileX; uint32_t maxY = maxTileY; uint32_t maxX = maxTileX; RenderOutputBuffers renderBuffers, currentRenderBufferRow; - GetRenderHotTiles<RT::MT::numSamples>(pDC, workerId, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex); +
GetRenderHotTiles<RT::MT::numSamples>(pDC, + workerId, + macroTile, + minTileX, + minTileY, + renderBuffers, + triDesc.triFlags.renderTargetArrayIndex); currentRenderBufferRow = renderBuffers; // rasterize and generate coverage masks per sample @@ -1168,26 +1269,31 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++) { // trivial reject, at least one edge has all 4 corners of raster tile outside - bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2); + bool trivialReject = + TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2); if (!trivialReject) { // trivial accept mask triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; - // Update the raster tile edge masks based on inner conservative edge offsets, if enabled - UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT> - (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); + // Update the raster tile edge masks based on inner conservative edge offsets, + // if enabled + UpdateEdgeMasksInnerConservative<RT, + typename RT::ValidEdgeMaskT, + typename RT::InputCoverageT>( + vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); // @todo Make this a bit smarter to allow use of trivial accept when: // 1) scissor/vp intersection rect is raster tile aligned // 2) raster tile is entirely within scissor/vp intersection rect if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2)) { - // trivial accept, all 4 corners of all 3 edges are negative + // trivial accept, all 4 corners of all 3 edges are negative // i.e. raster tile completely inside triangle triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; - if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value) + if (std::is_same<typename RT::InputCoverageT, + InnerConservativeCoverageT>::value) { triDesc.innerCoverageMask = 0xffffffffffffffffULL; } @@ -1196,9 +1302,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, else { __m256d vEdgeAtSample[RT::NumEdgesT::value]; - if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value) + if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) { - // should get optimized out for single sample case (global value numbering or copy propagation) + // should get optimized out for single sample case (global value + // numbering or copy propagation) for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vEdgeAtSample[e] = vEdgeFix16[e]; @@ -1206,23 +1313,25 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else { - const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; - __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); - __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); + const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; + __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); + __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); // step edge equation tests from UL tile corner to pixel sample position for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); + __m256d vResultAxFix16 = + 
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); + __m256d vResultByFix16 = + _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); } } - double startQuadEdges[RT::NumEdgesT::value]; + double startQuadEdges[RT::NumEdgesT::value]; const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { @@ -1231,19 +1340,25 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // not trivial accept or reject, must rasterize full tile RDTSC_BEGIN(BERasterizePartial, pDC->drawId); - triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges); + triDesc.coverageMask[sampleNum] = + rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( + pDC, startQuadEdges, rastEdges); RDTSC_END(BERasterizePartial, 0); - triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; - + triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + // Output SV InnerCoverage, if needed - GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask); + GenerateSVInnerCoverage<RT, + typename RT::ValidEdgeMaskT, + typename RT::InputCoverageT>( + pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask); } } else { - // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything - if(NumCoverageSamplesT::value > 1) + // if we're calculating coverage per sample, need to store it off. Otherwise no + // covered samples, don't need to do anything + if (NumCoverageSamplesT::value > 1) { triDesc.coverageMask[sampleNum] = 0; } @@ -1252,19 +1367,22 @@ ... #if KNOB_ENABLE_TOSS_POINTS - if(KNOB_TOSS_RS) + if (KNOB_TOSS_RS) { gToss = triDesc.coverageMask[0]; } else #endif - if(triDesc.anyCoveredSamples) + if (triDesc.anyCoveredSamples) { - // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered - // copy conservative coverage result to all samples - if(RT::IsConservativeT::value) + // if conservative rast and MSAA are enabled, conservative coverage for a pixel + // means all samples in that pixel are covered; copy conservative coverage result to + // all samples + if (RT::IsConservativeT::value) { - auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; }; + auto copyCoverage = [&](int sample) { + triDesc.coverageMask[sample] = triDesc.coverageMask[0]; + }; UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage); } @@ -1272,14 +1390,20 @@ ... AR_EVENT(RasterTileCount(pDC->drawId, 1)); RDTSC_BEGIN(BEPixelBackend, pDC->drawId); - backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); + backendFuncs.pfnBackend(pDC, + workerId, + tileX << KNOB_TILE_X_DIM_SHIFT, + tileY << KNOB_TILE_Y_DIM_SHIFT, + triDesc, + renderBuffers); RDTSC_END(BEPixelBackend, 0); } // step to the next tile in X for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { - vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e],
_mm256_set1_pd(rastEdges[e].stepRasterTileX)); + vEdgeFix16[e] = + _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); } StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers); } @@ -1287,7 +1411,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // step to the next tile in Y for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { - vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); + vEdgeFix16[e] = + _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); } StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow); } @@ -1297,10 +1422,16 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Get pointers to hot tile memory for color RT, depth, stencil template <uint32_t numSamples> -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex) +void GetRenderHotTiles(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroID, + uint32_t tileX, + uint32_t tileY, + RenderOutputBuffers& renderBuffers, + uint32_t renderTargetArrayIndex) { - const API_STATE& state = GetApiState(pDC); - SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + SWR_CONTEXT* pContext = pDC->pContext; HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; uint32_t mx, my; @@ -1310,46 +1441,73 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, u // compute tile offset for active hottile buffers const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while(_BitScanForward(&rtSlot, colorHottileEnableMask)) + uint32_t offset = ComputeTileOffset2D< + TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>( + pitch, tileX, tileY); + offset *= numSamples; + + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while (_BitScanForward(&rtSlot, colorHottileEnableMask)) { - HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, - numSamples, renderTargetArrayIndex); - pColor->state = HOTTILE_DIRTY; + HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile( + pContext, + pDC, + hWorkerPrivateData, + macroID, + (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), + true, + numSamples, + renderTargetArrayIndex); + pColor->state = HOTTILE_DIRTY; renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; - + colorHottileEnableMask &= ~(1 << rtSlot); } - if(state.depthHottileEnable) + if (state.depthHottileEnable) { - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, - numSamples, renderTargetArrayIndex); - pDepth->state = HOTTILE_DIRTY; + const 
uint32_t pitch = + KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D< + TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>( + pitch, tileX, tileY); + offset *= numSamples; + HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroID, + SWR_ATTACHMENT_DEPTH, + true, + numSamples, + renderTargetArrayIndex); + pDepth->state = HOTTILE_DIRTY; SWR_ASSERT(pDepth->pBuffer != nullptr); renderBuffers.pDepth = pDepth->pBuffer + offset; } - if(state.stencilHottileEnable) + if (state.stencilHottileEnable) { - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, - numSamples, renderTargetArrayIndex); - pStencil->state = HOTTILE_DIRTY; + const uint32_t pitch = + KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D< + TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>( + pitch, tileX, tileY); + offset *= numSamples; + HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroID, + SWR_ATTACHMENT_STENCIL, + true, + numSamples, + renderTargetArrayIndex); + pStencil->state = HOTTILE_DIRTY; SWR_ASSERT(pStencil->pBuffer != nullptr); renderBuffers.pStencil = pStencil->pBuffer + offset; } } template <typename RT> -INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers) +INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers) { DWORD rt = 0; while (_BitScanForward(&rt, colorHotTileMask)) @@ -1357,13 +1515,15 @@ INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buff colorHotTileMask &= ~(1 << rt); buffers.pColor[rt] += RT::colorRasterTileStep; } - + buffers.pDepth += RT::depthRasterTileStep; buffers.pStencil += RT::stencilRasterTileStep; } template <typename RT> -INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) +INLINE void StepRasterTileY(uint32_t colorHotTileMask, + RenderOutputBuffers& buffers, + RenderOutputBuffers& startBufferRow) { DWORD rt = 0; while (_BitScanForward(&rt, colorHotTileMask)) @@ -1378,4 +1538,3 @@ INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buff startBufferRow.pStencil += RT::stencilRasterTileRowStep; buffers.pStencil = startBufferRow.pStencil; } - diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp index 48ea397018b..e858a7d599e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp @@ -1,99 +1,100 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ****************************************************************************/ #include "rdtsc_core.h" #include "common/rdtsc_buckets.h" // must match CORE_BUCKETS enum order BUCKET_DESC gCoreBuckets[] = { - { "APIClearRenderTarget", "", true, 0xff0b8bea }, - { "APIDraw", "", true, 0xff000066 }, - { "APIDrawWakeAllThreads", "", false, 0xffffffff }, - { "APIDrawIndexed", "", true, 0xff000066 }, - { "APIDispatch", "", true, 0xff660000 }, - { "APIStoreTiles", "", true, 0xff00ffff }, - { "APIGetDrawContext", "", false, 0xffffffff }, - { "APISync", "", true, 0xff6666ff }, - { "APIWaitForIdle", "", true, 0xff0000ff }, - { "FEProcessDraw", "", true, 0xff009900 }, - { "FEProcessDrawIndexed", "", true, 0xff009900 }, - { "FEFetchShader", "", false, 0xffffffff }, - { "FEVertexShader", "", false, 0xffffffff }, - { "FEHullShader", "", false, 0xffffffff }, - { "FETessellation", "", false, 0xffffffff }, - { "FEDomainShader", "", false, 0xffffffff }, - { "FEGeometryShader", "", false, 0xffffffff }, - { "FEStreamout", "", false, 0xffffffff }, - { "FEPAAssemble", "", false, 0xffffffff }, - { "FEBinPoints", "", false, 0xff29b854 }, - { "FEBinLines", "", false, 0xff29b854 }, - { "FEBinTriangles", "", false, 0xff29b854 }, - { "FETriangleSetup", "", false, 0xffffffff }, - { "FEViewportCull", "", false, 0xffffffff }, - { "FEGuardbandClip", "", false, 0xffffffff }, - { "FEClipPoints", "", false, 0xffffffff }, - { "FEClipLines", "", false, 0xffffffff }, - { "FEClipTriangles", "", false, 0xffffffff }, - { "FEClipRectangles", "", false, 0xffffffff }, - { "FECullZeroAreaAndBackface", "", false, 0xffffffff }, - { "FECullBetweenCenters", "", false, 0xffffffff }, - { "FEEarlyRastEnter", "", false, 0xffffffff }, - { "FEEarlyRastExit", "", false, 0xffffffff }, - { "FEProcessStoreTiles", "", true, 0xff39c864 }, - { "FEProcessInvalidateTiles", "", true, 0xffffffff }, - { "WorkerWorkOnFifoBE", "", false, 0xff40261c }, - { "WorkerFoundWork", "", false, 0xff573326 }, - { "BELoadTiles", "", true, 0xffb0e2ff }, - { "BEDispatch", "", true, 0xff00a2ff }, - { "BEClear", "", true, 0xff00ccbb }, - { "BERasterizeLine", "", true, 0xffb26a4e }, - { "BERasterizeTriangle", "", true, 0xffb26a4e }, - { "BETriangleSetup", "", false, 0xffffffff }, - { "BEStepSetup", "", false, 0xffffffff }, - { "BECullZeroArea", "", false, 0xffffffff }, - { "BEEmptyTriangle", "", false, 0xffffffff }, - { "BETrivialAccept", "", false, 0xffffffff }, - { "BETrivialReject", "", false, 0xffffffff }, - { "BERasterizePartial", "", false, 0xffffffff }, - { "BEPixelBackend", "", false, 0xffffffff }, - { "BESetup", "", false, 0xffffffff }, - { "BEBarycentric", "", false, 0xffffffff }, - { "BEEarlyDepthTest", "", false, 0xffffffff }, - { "BEPixelShader", "", false, 0xffffffff }, - { "BESingleSampleBackend", "", false, 0xffffffff }, - { "BEPixelRateBackend", "", false, 0xffffffff }, - { "BESampleRateBackend", "", false, 0xffffffff }, - { "BENullBackend", "", false, 0xffffffff }, - { "BELateDepthTest", "", false, 0xffffffff }, - { "BEOutputMerger", "", false, 0xffffffff }, - { "BEStoreTiles", "", true, 0xff00cccc }, - { "BEEndTile", "", false, 0xffffffff }, + {"APIClearRenderTarget", "", true, 0xff0b8bea}, + {"APIDraw", "", true, 0xff000066}, + {"APIDrawWakeAllThreads", "", false, 0xffffffff}, + {"APIDrawIndexed", "", true, 0xff000066}, + {"APIDispatch", "", true, 0xff660000}, + {"APIStoreTiles", "", true, 0xff00ffff}, + {"APIGetDrawContext", "", false, 0xffffffff}, + {"APISync", "", true, 0xff6666ff}, + {"APIWaitForIdle", "", true, 0xff0000ff}, + {"FEProcessDraw", "", 
true, 0xff009900}, + {"FEProcessDrawIndexed", "", true, 0xff009900}, + {"FEFetchShader", "", false, 0xffffffff}, + {"FEVertexShader", "", false, 0xffffffff}, + {"FEHullShader", "", false, 0xffffffff}, + {"FETessellation", "", false, 0xffffffff}, + {"FEDomainShader", "", false, 0xffffffff}, + {"FEGeometryShader", "", false, 0xffffffff}, + {"FEStreamout", "", false, 0xffffffff}, + {"FEPAAssemble", "", false, 0xffffffff}, + {"FEBinPoints", "", false, 0xff29b854}, + {"FEBinLines", "", false, 0xff29b854}, + {"FEBinTriangles", "", false, 0xff29b854}, + {"FETriangleSetup", "", false, 0xffffffff}, + {"FEViewportCull", "", false, 0xffffffff}, + {"FEGuardbandClip", "", false, 0xffffffff}, + {"FEClipPoints", "", false, 0xffffffff}, + {"FEClipLines", "", false, 0xffffffff}, + {"FEClipTriangles", "", false, 0xffffffff}, + {"FEClipRectangles", "", false, 0xffffffff}, + {"FECullZeroAreaAndBackface", "", false, 0xffffffff}, + {"FECullBetweenCenters", "", false, 0xffffffff}, + {"FEEarlyRastEnter", "", false, 0xffffffff}, + {"FEEarlyRastExit", "", false, 0xffffffff}, + {"FEProcessStoreTiles", "", true, 0xff39c864}, + {"FEProcessInvalidateTiles", "", true, 0xffffffff}, + {"WorkerWorkOnFifoBE", "", false, 0xff40261c}, + {"WorkerFoundWork", "", false, 0xff573326}, + {"BELoadTiles", "", true, 0xffb0e2ff}, + {"BEDispatch", "", true, 0xff00a2ff}, + {"BEClear", "", true, 0xff00ccbb}, + {"BERasterizeLine", "", true, 0xffb26a4e}, + {"BERasterizeTriangle", "", true, 0xffb26a4e}, + {"BETriangleSetup", "", false, 0xffffffff}, + {"BEStepSetup", "", false, 0xffffffff}, + {"BECullZeroArea", "", false, 0xffffffff}, + {"BEEmptyTriangle", "", false, 0xffffffff}, + {"BETrivialAccept", "", false, 0xffffffff}, + {"BETrivialReject", "", false, 0xffffffff}, + {"BERasterizePartial", "", false, 0xffffffff}, + {"BEPixelBackend", "", false, 0xffffffff}, + {"BESetup", "", false, 0xffffffff}, + {"BEBarycentric", "", false, 0xffffffff}, + {"BEEarlyDepthTest", "", false, 0xffffffff}, + {"BEPixelShader", "", false, 0xffffffff}, + {"BESingleSampleBackend", "", false, 0xffffffff}, + {"BEPixelRateBackend", "", false, 0xffffffff}, + {"BESampleRateBackend", "", false, 0xffffffff}, + {"BENullBackend", "", false, 0xffffffff}, + {"BELateDepthTest", "", false, 0xffffffff}, + {"BEOutputMerger", "", false, 0xffffffff}, + {"BEStoreTiles", "", true, 0xff00cccc}, + {"BEEndTile", "", false, 0xffffffff}, }; -static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])), "RDTSC Bucket enum and description table size mismatched."); +static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])), + "RDTSC Bucket enum and description table size mismatched."); /// @todo bucketmanager and mapping should probably be a part of the SWR context std::vector<uint32_t> gBucketMap; -BucketManager gBucketMgr; +BucketManager gBucketMgr; -uint32_t gCurrentFrame = 0; -bool gBucketsInitialized = false; +uint32_t gCurrentFrame = 0; +bool gBucketsInitialized = false; diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h index 704da650d85..dc20e5be98d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ****************************************************************************/ #pragma once #include "knobs.h" @@ -124,10 +124,10 @@ void rdtscEndFrame(); #endif extern std::vector<uint32_t> gBucketMap; -extern BucketManager gBucketMgr; -extern BUCKET_DESC gCoreBuckets[]; -extern uint32_t gCurrentFrame; -extern bool gBucketsInitialized; +extern BucketManager gBucketMgr; +extern BUCKET_DESC gCoreBuckets[]; +extern uint32_t gCurrentFrame; +extern bool gBucketsInitialized; INLINE void rdtscReset() { @@ -174,12 +174,14 @@ INLINE void rdtscEndFrame() { gCurrentFrame++; - if (gCurrentFrame == KNOB_BUCKETS_START_FRAME && KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) + if (gCurrentFrame == KNOB_BUCKETS_START_FRAME && + KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) { gBucketMgr.StartCapture(); } - if (gCurrentFrame == KNOB_BUCKETS_END_FRAME && KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) + if (gCurrentFrame == KNOB_BUCKETS_END_FRAME && + KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) { gBucketMgr.StopCapture(); gBucketMgr.PrintReport("rdtsc.txt"); diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h index f1bef2190fb..133420e6f3d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -1,56 +1,52 @@ /**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file arena.h -* -* @brief RingBuffer -* The RingBuffer class manages all aspects of the ring buffer including -* the head/tail indices, etc. -* -******************************************************************************/ + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file arena.h + * + * @brief RingBuffer + * The RingBuffer class manages all aspects of the ring buffer including + * the head/tail indices, etc. + * + ******************************************************************************/ #pragma once -template<typename T> +template <typename T> class RingBuffer { public: - RingBuffer() - : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) - { - } + RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {} - ~RingBuffer() - { - Destroy(); - } + ~RingBuffer() { Destroy(); } void Init(uint32_t numEntries) { SWR_ASSERT(numEntries > 0); - SWR_ASSERT(((1ULL << 32) % numEntries) == 0, "%d is not evenly divisible into 2 ^ 32. Wrap errors will occur!", numEntries); - mNumEntries = numEntries; - mpRingBuffer = (T*)AlignedMalloc(sizeof(T)*numEntries, 64); + SWR_ASSERT(((1ULL << 32) % numEntries) == 0, + "%d is not evenly divisible into 2 ^ 32. Wrap errors will occur!", + numEntries); + mNumEntries = numEntries; + mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64); SWR_ASSERT(mpRingBuffer != nullptr); - memset(mpRingBuffer, 0, sizeof(T)*numEntries); + memset(mpRingBuffer, 0, sizeof(T) * numEntries); } void Destroy() @@ -77,10 +73,7 @@ public: InterlockedIncrement(&mRingTail); // There are multiple consumers. } - INLINE bool IsEmpty() - { - return (GetHead() == GetTail()); - } + INLINE bool IsEmpty() { return (GetHead() == GetTail()); } INLINE bool IsFull() { @@ -94,9 +87,9 @@ public: INLINE uint32_t GetHead() volatile { return mRingHead; } protected: - T* mpRingBuffer; + T* mpRingBuffer; uint32_t mNumEntries; - OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter - OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter + OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter + OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter }; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 9db17eeed01..0b42a457945 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -1,30 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
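The RingBuffer reformatted above (ringbuffer.h) keeps free-running 32-bit head/tail counters rather than wrapped indices, which is what the Init() assertion protects: slots are addressed as counter % mNumEntries, and that mapping only stays continuous across uint32_t wraparound when mNumEntries divides 2^32 evenly, i.e. is a power of two. A minimal sketch of the counter arithmetic under that assumption; the helper names are hypothetical, and IsFullSketch is one consistent definition since IsFull's body falls outside the quoted hunks:

    #include <cstdint>

    // Free-running producer/consumer counters; arithmetic only, no
    // synchronization shown. numEntries must divide 2^32 evenly.
    inline uint32_t SlotIndex(uint32_t counter, uint32_t numEntries)
    {
        return counter % numEntries; // continuous across wrap only for such sizes
    }
    inline bool IsEmptySketch(uint32_t head, uint32_t tail) { return head == tail; }
    inline bool IsFullSketch(uint32_t head, uint32_t tail, uint32_t numEntries)
    {
        return (tail - head) == numEntries; // unsigned subtraction survives wrap
    }

For example, with numEntries = 8 a tail of 0xFFFFFFFF maps to slot 7 and the next enqueue (tail wraps to 0) maps to slot 0 -- consecutive, exactly because 8 divides 2^32; a numEntries of 6 would jump from slot 3 to slot 0 and corrupt ordering.
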
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file state.h -* -* @brief Definitions for API state. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file state.h + * + * @brief Definitions for API state. + * + ******************************************************************************/ +// Skipping clang-format due to parsing by simplistic python scripts +// clang-format off #pragma once #include "common/formats.h" @@ -39,63 +41,63 @@ using gfxptr_t = unsigned long long; ////////////////////////////////////////////////////////////////////////// enum PRIMITIVE_TOPOLOGY { - TOP_UNKNOWN = 0x0, - TOP_POINT_LIST = 0x1, - TOP_LINE_LIST = 0x2, - TOP_LINE_STRIP = 0x3, - TOP_TRIANGLE_LIST = 0x4, - TOP_TRIANGLE_STRIP = 0x5, - TOP_TRIANGLE_FAN = 0x6, - TOP_QUAD_LIST = 0x7, - TOP_QUAD_STRIP = 0x8, - TOP_LINE_LIST_ADJ = 0x9, - TOP_LISTSTRIP_ADJ = 0xA, - TOP_TRI_LIST_ADJ = 0xB, - TOP_TRI_STRIP_ADJ = 0xC, - TOP_TRI_STRIP_REVERSE = 0xD, - TOP_POLYGON = 0xE, - TOP_RECT_LIST = 0xF, - TOP_LINE_LOOP = 0x10, - TOP_POINT_LIST_BF = 0x11, - TOP_LINE_STRIP_CONT = 0x12, - TOP_LINE_STRIP_BF = 0x13, - TOP_LINE_STRIP_CONT_BF = 0x14, + TOP_UNKNOWN = 0x0, + TOP_POINT_LIST = 0x1, + TOP_LINE_LIST = 0x2, + TOP_LINE_STRIP = 0x3, + TOP_TRIANGLE_LIST = 0x4, + TOP_TRIANGLE_STRIP = 0x5, + TOP_TRIANGLE_FAN = 0x6, + TOP_QUAD_LIST = 0x7, + TOP_QUAD_STRIP = 0x8, + TOP_LINE_LIST_ADJ = 0x9, + TOP_LISTSTRIP_ADJ = 0xA, + TOP_TRI_LIST_ADJ = 0xB, + TOP_TRI_STRIP_ADJ = 0xC, + TOP_TRI_STRIP_REVERSE = 0xD, + TOP_POLYGON = 0xE, + TOP_RECT_LIST = 0xF, + TOP_LINE_LOOP = 0x10, + TOP_POINT_LIST_BF = 0x11, + TOP_LINE_STRIP_CONT = 0x12, + TOP_LINE_STRIP_BF = 0x13, + TOP_LINE_STRIP_CONT_BF = 0x14, TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, - TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? - - TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. 
- TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches - TOP_PATCHLIST_2 = 0x21, - TOP_PATCHLIST_3 = 0x22, - TOP_PATCHLIST_4 = 0x23, - TOP_PATCHLIST_5 = 0x24, - TOP_PATCHLIST_6 = 0x25, - TOP_PATCHLIST_7 = 0x26, - TOP_PATCHLIST_8 = 0x27, - TOP_PATCHLIST_9 = 0x28, - TOP_PATCHLIST_10 = 0x29, - TOP_PATCHLIST_11 = 0x2A, - TOP_PATCHLIST_12 = 0x2B, - TOP_PATCHLIST_13 = 0x2C, - TOP_PATCHLIST_14 = 0x2D, - TOP_PATCHLIST_15 = 0x2E, - TOP_PATCHLIST_16 = 0x2F, - TOP_PATCHLIST_17 = 0x30, - TOP_PATCHLIST_18 = 0x31, - TOP_PATCHLIST_19 = 0x32, - TOP_PATCHLIST_20 = 0x33, - TOP_PATCHLIST_21 = 0x34, - TOP_PATCHLIST_22 = 0x35, - TOP_PATCHLIST_23 = 0x36, - TOP_PATCHLIST_24 = 0x37, - TOP_PATCHLIST_25 = 0x38, - TOP_PATCHLIST_26 = 0x39, - TOP_PATCHLIST_27 = 0x3A, - TOP_PATCHLIST_28 = 0x3B, - TOP_PATCHLIST_29 = 0x3C, - TOP_PATCHLIST_30 = 0x3D, - TOP_PATCHLIST_31 = 0x3E, - TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches + TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? + + TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. + TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches + TOP_PATCHLIST_2 = 0x21, + TOP_PATCHLIST_3 = 0x22, + TOP_PATCHLIST_4 = 0x23, + TOP_PATCHLIST_5 = 0x24, + TOP_PATCHLIST_6 = 0x25, + TOP_PATCHLIST_7 = 0x26, + TOP_PATCHLIST_8 = 0x27, + TOP_PATCHLIST_9 = 0x28, + TOP_PATCHLIST_10 = 0x29, + TOP_PATCHLIST_11 = 0x2A, + TOP_PATCHLIST_12 = 0x2B, + TOP_PATCHLIST_13 = 0x2C, + TOP_PATCHLIST_14 = 0x2D, + TOP_PATCHLIST_15 = 0x2E, + TOP_PATCHLIST_16 = 0x2F, + TOP_PATCHLIST_17 = 0x30, + TOP_PATCHLIST_18 = 0x31, + TOP_PATCHLIST_19 = 0x32, + TOP_PATCHLIST_20 = 0x33, + TOP_PATCHLIST_21 = 0x34, + TOP_PATCHLIST_22 = 0x35, + TOP_PATCHLIST_23 = 0x36, + TOP_PATCHLIST_24 = 0x37, + TOP_PATCHLIST_25 = 0x38, + TOP_PATCHLIST_26 = 0x39, + TOP_PATCHLIST_27 = 0x3A, + TOP_PATCHLIST_28 = 0x3B, + TOP_PATCHLIST_29 = 0x3C, + TOP_PATCHLIST_30 = 0x3D, + TOP_PATCHLIST_31 = 0x3E, + TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches }; ////////////////////////////////////////////////////////////////////////// @@ -173,7 +175,6 @@ enum SWR_OUTER_TESSFACTOR_ID SWR_NUM_OUTER_TESS_FACTORS, }; - ///////////////////////////////////////////////////////////////////////// /// simdvertex /// @brief Defines a vertex element that holds all the data for SIMD vertices. 
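The patchlist block in the hunk above is deliberately arithmetic: TOP_PATCHLIST_BASE (0x1F) sits one below TOP_PATCHLIST_1 (0x20), so the control-point count of any patchlist topology is just its distance from the base, as the "used to calculate num verts" comment says. A one-line sketch with a hypothetical helper name:

    // Valid for TOP_PATCHLIST_1 (0x20) through TOP_PATCHLIST_32 (0x3F).
    inline uint32_t NumVertsPerPatch(PRIMITIVE_TOPOLOGY topology)
    {
        return static_cast<uint32_t>(topology) - static_cast<uint32_t>(TOP_PATCHLIST_BASE);
    }

So TOP_PATCHLIST_3 (0x22) yields 0x22 - 0x1F = 3 control points per patch.
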
@@ -182,9 +183,9 @@ enum SWR_OUTER_TESSFACTOR_ID enum SWR_VTX_SLOTS { VERTEX_SGV_SLOT = 0, - VERTEX_SGV_RTAI_COMP = 0, - VERTEX_SGV_VAI_COMP = 1, - VERTEX_SGV_POINT_SIZE_COMP = 2, + VERTEX_SGV_RTAI_COMP = 0, + VERTEX_SGV_VAI_COMP = 1, + VERTEX_SGV_POINT_SIZE_COMP = 2, VERTEX_POSITION_SLOT = 1, VERTEX_POSITION_END_SLOT = 1, VERTEX_CLIPCULL_DIST_LO_SLOT = (1 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist @@ -197,21 +198,21 @@ enum SWR_VTX_SLOTS // SoAoSoA struct simdvertex { - simdvector attrib[SWR_VTX_NUM_SLOTS]; + simdvector attrib[SWR_VTX_NUM_SLOTS]; }; #if ENABLE_AVX512_SIMD16 struct simd16vertex { - simd16vector attrib[SWR_VTX_NUM_SLOTS]; + simd16vector attrib[SWR_VTX_NUM_SLOTS]; }; #endif -template<typename SIMD_T> +template <typename SIMD_T> struct SIMDVERTEX_T { - typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS]; + typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS]; }; ////////////////////////////////////////////////////////////////////////// @@ -229,19 +230,20 @@ struct SWR_SHADER_STATS ///////////////////////////////////////////////////////////////////////// struct SWR_VS_CONTEXT { - simdvertex* pVin; // IN: SIMD input vertex data store - simdvertex* pVout; // OUT: SIMD output vertex data store + simdvertex* pVin; // IN: SIMD input vertex data store + simdvertex* pVout; // OUT: SIMD output vertex data store - uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD - simdscalari VertexID; // IN: Vertex ID - simdscalari mask; // IN: Active mask for shader + uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD + simdscalari VertexID; // IN: Vertex ID + simdscalari mask; // IN: Active mask for shader // SIMD16 Frontend fields. - uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in simd16vertex output - simd16scalari mask16; // IN: Active mask for shader (16-wide) - simd16scalari VertexID16; // IN: Vertex ID (16-wide) + uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in + // simd16vertex output + simd16scalari mask16; // IN: Active mask for shader (16-wide) + simd16scalari VertexID16; // IN: Vertex ID (16-wide) - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. + SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. 
}; ///////////////////////////////////////////////////////////////////////// @@ -268,16 +270,16 @@ struct ScalarCPoint ///////////////////////////////////////////////////////////////////////// struct SWR_TESSELLATION_FACTORS { - float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; - float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; + float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; + float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; }; #define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches struct ScalarPatch { SWR_TESSELLATION_FACTORS tessFactors; - ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; - ScalarCPoint patchData; + ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; + ScalarCPoint patchData; }; ////////////////////////////////////////////////////////////////////////// @@ -286,12 +288,11 @@ struct ScalarPatch ///////////////////////////////////////////////////////////////////////// struct SWR_HS_CONTEXT { - simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data - simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call - simdscalari mask; // IN: Active mask for shader - ScalarPatch* pCPout; // OUT: Output control point patch - // SIMD-sized-array of SCALAR patches - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. + simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data + simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call + simdscalari mask; // IN: Active mask for shader + ScalarPatch* pCPout; // OUT: Output control point patch SIMD-sized-array of SCALAR patches + SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. }; ////////////////////////////////////////////////////////////////////////// @@ -318,13 +319,13 @@ struct SWR_DS_CONTEXT ///////////////////////////////////////////////////////////////////////// struct SWR_GS_CONTEXT { - simdvector* pVerts; // IN: input primitive data for SIMD prims - uint32_t inputVertStride; // IN: input vertex stride, in attributes - simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call - uint32_t InstanceID; // IN: input instance ID - simdscalari mask; // IN: Active mask for shader - uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams) - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. + simdvector* pVerts; // IN: input primitive data for SIMD prims + uint32_t inputVertStride; // IN: input vertex stride, in attributes + simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call + uint32_t InstanceID; // IN: input instance ID + simdscalari mask; // IN: Active mask for shader + uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams) + SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. 
}; struct PixelPositions @@ -343,36 +344,35 @@ struct PixelPositions ///////////////////////////////////////////////////////////////////////// struct SWR_PS_CONTEXT { - PixelPositions vX; // IN: x location(s) of pixels - PixelPositions vY; // IN: x location(s) of pixels - simdscalar vZ; // INOUT: z location of pixels - simdscalari activeMask; // OUT: mask for kill - simdscalar inputMask; // IN: input coverage mask for all samples - simdscalari oMask; // OUT: mask for output coverage + PixelPositions vX; // IN: x location(s) of pixels + PixelPositions vY; // IN: x location(s) of pixels + simdscalar vZ; // INOUT: z location of pixels + simdscalari activeMask; // OUT: mask for kill + simdscalar inputMask; // IN: input coverage mask for all samples + simdscalari oMask; // OUT: mask for output coverage - PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid + PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid PixelPositions vJ; - PixelPositions vOneOverW; // IN: 1/w + PixelPositions vOneOverW; // IN: 1/w const float* pAttribs; // IN: pointer to attribute barycentric coefficients const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients const float* pRecipW; // IN: pointer to 1/w coord for each vertex - const float *I; // IN: Barycentric A, B, and C coefs used to compute I - const float *J; // IN: Barycentric A, B, and C coefs used to compute J - float recipDet; // IN: 1/Det, used when barycentric interpolating attributes + const float* I; // IN: Barycentric A, B, and C coefs used to compute I + const float* J; // IN: Barycentric A, B, and C coefs used to compute J + float recipDet; // IN: 1/Det, used when barycentric interpolating attributes const float* pSamplePosX; // IN: array of sample positions const float* pSamplePosY; // IN: array of sample positions - simdvector shaded[SWR_NUM_RENDERTARGETS]; - // OUT: result color per rendertarget + simdvector shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget - uint32_t frontFace; // IN: front- 1, back- 0 - uint32_t sampleIndex; // IN: sampleIndex - uint32_t renderTargetArrayIndex; // IN: render target array index from GS - uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer + uint32_t frontFace; // IN: front- 1, back- 0 + uint32_t sampleIndex; // IN: sampleIndex + uint32_t renderTargetArrayIndex; // IN: render target array index from GS + uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. + SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. }; ////////////////////////////////////////////////////////////////////////// @@ -401,41 +401,41 @@ struct SWR_CS_CONTEXT // count into the shader. When the count reaches 0 then all thread groups in the // dispatch call have been completed. - uint32_t tileCounter; // The tile counter value for this thread group. + uint32_t tileCounter; // The tile counter value for this thread group. // Dispatch dimensions used by shader to compute system values from the tile counter. uint32_t dispatchDims[3]; uint8_t* pTGSM; // Thread Group Shared Memory pointer. 
uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support - uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the shader, shader is responsible - // for subdividing scratch space per instance/simd + uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the shader, shader is + // responsible for subdividing scratch space per instance/simd uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. + SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. }; // enums enum SWR_TILE_MODE { - SWR_TILE_NONE = 0x0, // Linear mode (no tiling) - SWR_TILE_MODE_WMAJOR, // W major tiling - SWR_TILE_MODE_XMAJOR, // X major tiling - SWR_TILE_MODE_YMAJOR, // Y major tiling - SWR_TILE_SWRZ, // SWR-Z tiling + SWR_TILE_NONE = 0x0, // Linear mode (no tiling) + SWR_TILE_MODE_WMAJOR, // W major tiling + SWR_TILE_MODE_XMAJOR, // X major tiling + SWR_TILE_MODE_YMAJOR, // Y major tiling + SWR_TILE_SWRZ, // SWR-Z tiling SWR_TILE_MODE_COUNT }; enum SWR_SURFACE_TYPE { - SURFACE_1D = 0, - SURFACE_2D = 1, - SURFACE_3D = 2, - SURFACE_CUBE = 3, - SURFACE_BUFFER = 4, + SURFACE_1D = 0, + SURFACE_2D = 1, + SURFACE_3D = 2, + SURFACE_CUBE = 3, + SURFACE_BUFFER = 4, SURFACE_STRUCTURED_BUFFER = 5, - SURFACE_NULL = 7 + SURFACE_NULL = 7 }; enum SWR_ZFUNCTION @@ -537,34 +537,35 @@ struct SWR_LOD_OFFSETS ////////////////////////////////////////////////////////////////////////// struct SWR_SURFACE_STATE { - gfxptr_t xpBaseAddress; - SWR_SURFACE_TYPE type; // @llvm_enum - SWR_FORMAT format; // @llvm_enum - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t numSamples; - uint32_t samplePattern; - uint32_t pitch; - uint32_t qpitch; - uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler - uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed - float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler - uint32_t lod; // for render targets, the lod being rendered to - uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces - SWR_TILE_MODE tileMode; // @llvm_enum - uint32_t halign; - uint32_t valign; - uint32_t xOffset; - uint32_t yOffset; + gfxptr_t xpBaseAddress; + SWR_SURFACE_TYPE type; // @llvm_enum + SWR_FORMAT format; // @llvm_enum + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t numSamples; + uint32_t samplePattern; + uint32_t pitch; + uint32_t qpitch; + uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler + uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed + float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be + // accessed by sampler + uint32_t lod; // for render targets, the lod being rendered to + uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces + SWR_TILE_MODE tileMode; // @llvm_enum + uint32_t halign; + uint32_t valign; + uint32_t xOffset; + uint32_t yOffset; uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces - gfxptr_t xpAuxBaseAddress; // Used for compression, append/consume counter, etc. - SWR_AUX_MODE auxMode; // @llvm_enum + gfxptr_t xpAuxBaseAddress; // Used for compression, append/consume counter, etc. 
+ SWR_AUX_MODE auxMode; // @llvm_enum - bool bInterleavedSamples; // are MSAA samples stored interleaved or planar + bool bInterleavedSamples; // are MSAA samples stored interleaved or planar }; // vertex fetch state @@ -576,9 +577,10 @@ struct SWR_VERTEX_BUFFER_STATE uint32_t index; uint32_t pitch; uint32_t size; - uint32_t minVertex; // min vertex (for bounds checking) - uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks - uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for partially OOB vertices + uint32_t minVertex; // min vertex (for bounds checking) + uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks + uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for + // partially OOB vertices }; struct SWR_INDEX_BUFFER_STATE @@ -586,10 +588,9 @@ struct SWR_INDEX_BUFFER_STATE gfxptr_t xpIndices; // Format type for indices (e.g. UINT16, UINT32, etc.) SWR_FORMAT format; // @llvm_enum - uint32_t size; + uint32_t size; }; - ////////////////////////////////////////////////////////////////////////// /// SWR_FETCH_CONTEXT /// @brief Input to fetch shader. @@ -598,20 +599,21 @@ struct SWR_INDEX_BUFFER_STATE ///////////////////////////////////////////////////////////////////////// struct SWR_FETCH_CONTEXT { - const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers - gfxptr_t xpIndices; // IN: pointer to int32 index buffer for indexed draws - gfxptr_t xpLastIndex; // IN: pointer to end of index buffer, used for bounds checking - uint32_t CurInstance; // IN: current instance - uint32_t BaseVertex; // IN: base vertex - uint32_t StartVertex; // IN: start vertex - uint32_t StartInstance; // IN: start instance - simdscalari VertexID; // OUT: vector of vertex IDs - simdscalari CutMask; // OUT: vector mask of indices which have the cut index value + const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers + gfxptr_t xpIndices; // IN: pointer to int32 index buffer for indexed draws + gfxptr_t xpLastIndex; // IN: pointer to end of index buffer, used for bounds checking + uint32_t CurInstance; // IN: current instance + uint32_t BaseVertex; // IN: base vertex + uint32_t StartVertex; // IN: start vertex + uint32_t StartInstance; // IN: start instance + simdscalari VertexID; // OUT: vector of vertex IDs + simdscalari CutMask; // OUT: vector mask of indices which have the cut index value #if USE_SIMD16_SHADERS -// simd16scalari VertexID; // OUT: vector of vertex IDs -// simd16scalari CutMask; // OUT: vector mask of indices which have the cut index value - simdscalari VertexID2; // OUT: vector of vertex IDs - simdscalari CutMask2; // OUT: vector mask of indices which have the cut index value + // simd16scalari VertexID; // OUT: vector of vertex IDs + // simd16scalari CutMask; // OUT: vector mask of indices which have the + // cut index value + simdscalari VertexID2; // OUT: vector of vertex IDs + simdscalari CutMask2; // OUT: vector mask of indices which have the cut index value #endif }; @@ -627,8 +629,8 @@ OSALIGNLINE(struct) SWR_STATS uint64_t DepthPassCount; // Number of passing depth tests. Not exact. 
// Pipeline Stats - uint64_t PsInvocations; // Number of Pixel Shader invocations - uint64_t CsInvocations; // Number of Compute Shader invocations + uint64_t PsInvocations; // Number of Pixel Shader invocations + uint64_t CsInvocations; // Number of Compute Shader invocations }; @@ -654,9 +656,9 @@ OSALIGNLINE(struct) SWR_STATS_FE uint64_t SoNumPrimsWritten[4]; }; -////////////////////////////////////////////////////////////////////////// -/// STREAMOUT_BUFFERS -///////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + /// STREAMOUT_BUFFERS + ///////////////////////////////////////////////////////////////////////// #define MAX_SO_STREAMS 4 #define MAX_SO_BUFFERS 4 @@ -718,7 +720,7 @@ struct SWR_STREAMOUT_STATE ///////////////////////////////////////////////////////////////////////// struct SWR_STREAMOUT_CONTEXT { - uint32_t* pPrimData; + uint32_t* pPrimData; SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; // Num prims written for this stream @@ -736,8 +738,8 @@ struct SWR_GS_STATE bool gsEnable; // If true, geometry shader emits a single stream, with separate cut buffer. - // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer - // to map vertices to streams + // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a + // separate StreamID buffer to map vertices to streams bool isSingleStream; // Number of input attributes per vertex. Used by the frontend to @@ -748,7 +750,7 @@ struct SWR_GS_STATE uint32_t inputVertStride; // Output topology - can be point, tristrip, linestrip, or rectlist - PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum + PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum // Maximum number of verts that can be emitted by a single instance of the GS uint32_t maxNumVerts; @@ -763,14 +765,16 @@ struct SWR_GS_STATE // Total amount of memory to allocate for one instance of the shader output in bytes uint32_t allocationSize; - // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS + // Offset to the start of the attributes of the input vertices, in simdvector units, as read by + // the GS uint32_t vertexAttribOffset; // Offset to the attributes as stored by the preceding shader stage. uint32_t srcVertexAttribOffset; - // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle - // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits. + // Size of the control data section which contains cut or streamID data, in simdscalar units. + // Should be sized to handle the maximum number of verts output by the GS. Can be 0 if there are + // no cuts or streamID bits. uint32_t controlDataSize; // Offset to the control data section, in bytes @@ -782,15 +786,14 @@ struct SWR_GS_STATE // Offset to the start of the vertex section, in bytes uint32_t outputVertexOffset; - // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is - // expected to store the final vertex count in the first dword of the gs output stream. + // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, + // shader is expected to store the final vertex count in the first dword of the gs output + // stream. 
uint32_t staticVertexCount; uint32_t pad; }; -static_assert(sizeof(SWR_GS_STATE) == 64, - "Adjust padding to keep size (or remove this assert)"); - +static_assert(sizeof(SWR_GS_STATE) == 64, "Adjust padding to keep size (or remove this assert)"); ////////////////////////////////////////////////////////////////////////// /// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS @@ -834,22 +837,22 @@ enum SWR_TS_DOMAIN ///////////////////////////////////////////////////////////////////////// struct SWR_TS_STATE { - bool tsEnable; + bool tsEnable; - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum - SWR_TS_PARTITIONING partitioning; // @llvm_enum - SWR_TS_DOMAIN domain; // @llvm_enum + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum + SWR_TS_PARTITIONING partitioning; // @llvm_enum + SWR_TS_DOMAIN domain; // @llvm_enum - PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum + PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum - uint32_t numHsInputAttribs; - uint32_t numHsOutputAttribs; - uint32_t numDsOutputAttribs; - uint32_t dsAllocationSize; - uint32_t dsOutVtxAttribOffset; + uint32_t numHsInputAttribs; + uint32_t numHsOutputAttribs; + uint32_t numDsOutputAttribs; + uint32_t dsAllocationSize; + uint32_t dsOutVtxAttribOffset; // Offset to the start of the attributes of the input vertices, in simdvector units - uint32_t vertexAttribOffset; + uint32_t vertexAttribOffset; }; // output merger state @@ -860,7 +863,8 @@ struct SWR_RENDER_TARGET_BLEND_STATE uint8_t writeDisableBlue : 1; uint8_t writeDisableAlpha : 1; }; -static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); +static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, + "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); enum SWR_MULTISAMPLE_COUNT { @@ -887,7 +891,7 @@ struct SWR_BLEND_STATE uint32_t sampleMask; // all RT's have the same sample count ///@todo move this to Output Merger state when we refactor - SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; }; @@ -895,17 +899,17 @@ static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size"); struct SWR_BLEND_CONTEXT { - const SWR_BLEND_STATE* pBlendState; - simdvector* src; - simdvector* src1; - simdvector* src0alpha; - uint32_t sampleNum; - simdvector* pDst; - simdvector* result; - simdscalari* oMask; - simdscalari* pMask; - uint32_t isAlphaTested; - uint32_t isAlphaBlended; + const SWR_BLEND_STATE* pBlendState; + simdvector* src; + simdvector* src1; + simdvector* src0alpha; + uint32_t sampleNum; + simdvector* pDst; + simdvector* result; + simdscalari* oMask; + simdscalari* pMask; + uint32_t isAlphaTested; + uint32_t isAlphaBlended; }; ////////////////////////////////////////////////////////////////////////// @@ -922,13 +926,12 @@ typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext); typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext); typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); -typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext); -typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE 
hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); +typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*); typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &); - ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE ///////////////////////////////////////////////////////////////////////// @@ -1029,44 +1032,44 @@ enum SWR_PIXEL_LOCATION struct SWR_MULTISAMPLE_POS { public: - INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; }; // @llvm_func - INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; }; // @llvm_func - INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; }; // @llvm_func - INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; }; // @llvm_func - INLINE void SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; }; // @llvm_func - INLINE void SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; }; // @llvm_func - INLINE float X(uint32_t sampleNum) const { return _x[sampleNum]; }; // @llvm_func - INLINE float Y(uint32_t sampleNum) const { return _y[sampleNum]; }; // @llvm_func - typedef const float(&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES]; //@llvm_typedef - INLINE sampleArrayT X() const { return _x; }; // @llvm_func - INLINE sampleArrayT Y() const { return _y; }; // @llvm_func + INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; }; // @llvm_func + INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; }; // @llvm_func + INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; }; // @llvm_func + INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; }; // @llvm_func + INLINE void SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; }; // @llvm_func + INLINE void SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; }; // @llvm_func + INLINE float X(uint32_t sampleNum) const { return _x[sampleNum]; }; // @llvm_func + INLINE float Y(uint32_t sampleNum) const { return _y[sampleNum]; }; // @llvm_func + typedef const float (&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES]; //@llvm_typedef + INLINE sampleArrayT X() const { return _x; }; // @llvm_func + INLINE sampleArrayT Y() const { return _y; }; // @llvm_func INLINE const __m128i& vXi(uint32_t sampleNum) const { return _vXi[sampleNum]; }; // @llvm_func INLINE const __m128i& vYi(uint32_t sampleNum) const { return _vYi[sampleNum]; }; // @llvm_func INLINE const simdscalar& vX(uint32_t sampleNum) const { return _vX[sampleNum]; }; // @llvm_func INLINE const simdscalar& vY(uint32_t sampleNum) const { return _vY[sampleNum]; }; // @llvm_func - INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func - INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func + INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func + INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func INLINE void PrecalcSampleData(int numSamples); //@llvm_func private: template <typename MaskT> INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func - INLINE void CalcTileSampleOffsets(int numSamples); // @llvm_func + INLINE void CalcTileSampleOffsets(int numSamples); // @llvm_func // scalar sample values uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES]; uint32_t 
_yi[SWR_MAX_NUM_MULTISAMPLES]; - float _x[SWR_MAX_NUM_MULTISAMPLES]; - float _y[SWR_MAX_NUM_MULTISAMPLES]; + float _x[SWR_MAX_NUM_MULTISAMPLES]; + float _y[SWR_MAX_NUM_MULTISAMPLES]; // precalc'd / vectorized samples - __m128i _vXi[SWR_MAX_NUM_MULTISAMPLES]; - __m128i _vYi[SWR_MAX_NUM_MULTISAMPLES]; + __m128i _vXi[SWR_MAX_NUM_MULTISAMPLES]; + __m128i _vYi[SWR_MAX_NUM_MULTISAMPLES]; simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES]; simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES]; - __m128i tileSampleOffsetsX; - __m128i tileSampleOffsetsY; + __m128i tileSampleOffsetsX; + __m128i tileSampleOffsetsY; }; ////////////////////////////////////////////////////////////////////////// @@ -1074,33 +1077,33 @@ private: ////////////////////////////////////////////////////////////////////////// struct SWR_RASTSTATE { - uint32_t cullMode : 2; - uint32_t fillMode : 2; - uint32_t frontWinding : 1; - uint32_t scissorEnable : 1; - uint32_t depthClipEnable : 1; - uint32_t clipHalfZ : 1; - uint32_t pointParam : 1; - uint32_t pointSpriteEnable : 1; - uint32_t pointSpriteTopOrigin : 1; - uint32_t forcedSampleCount : 1; - uint32_t pixelOffset : 1; - uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units - uint32_t conservativeRast : 1; + uint32_t cullMode : 2; + uint32_t fillMode : 2; + uint32_t frontWinding : 1; + uint32_t scissorEnable : 1; + uint32_t depthClipEnable : 1; + uint32_t clipHalfZ : 1; + uint32_t pointParam : 1; + uint32_t pointSpriteEnable : 1; + uint32_t pointSpriteTopOrigin : 1; + uint32_t forcedSampleCount : 1; + uint32_t pixelOffset : 1; + uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units + uint32_t conservativeRast : 1; float pointSize; float lineWidth; - float depthBias; - float slopeScaledDepthBias; - float depthBiasClamp; - SWR_FORMAT depthFormat; // @llvm_enum + float depthBias; + float slopeScaledDepthBias; + float depthBiasClamp; + SWR_FORMAT depthFormat; // @llvm_enum // sample count the rasterizer is running at - SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum - uint32_t pixelLocation; // UL or Center - SWR_MULTISAMPLE_POS samplePositions; // @llvm_struct - bool bIsCenterPattern; // @llvm_enum + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + uint32_t pixelLocation; // UL or Center + SWR_MULTISAMPLE_POS samplePositions; // @llvm_struct + bool bIsCenterPattern; // @llvm_enum }; @@ -1122,17 +1125,21 @@ struct SWR_ATTRIB_SWIZZLE // backend state struct SWR_BACKEND_STATE { - uint32_t constantInterpolationMask; // bitmask indicating which attributes have constant interpolation - uint32_t pointSpriteTexCoordMask; // bitmask indicating the attribute(s) which should be interpreted as tex coordinates + uint32_t constantInterpolationMask; // bitmask indicating which attributes have constant + // interpolation + uint32_t pointSpriteTexCoordMask; // bitmask indicating the attribute(s) which should be + // interpreted as tex coordinates - bool swizzleEnable; // when enabled, core will parse the swizzle map when - // setting up attributes for the backend, otherwise - // all attributes up to numAttributes will be sent - uint8_t numAttributes; // total number of attributes to send to backend (up to 32) - uint8_t numComponents[32]; // number of components to setup per attribute, this reduces some calculations for unneeded components + bool swizzleEnable; // when enabled, core will parse the swizzle map when + // setting up attributes for the backend, otherwise + // all attributes up to numAttributes will be sent + 
uint8_t numAttributes; // total number of attributes to send to backend (up to 32) + uint8_t numComponents[32]; // number of components to setup per attribute, this reduces some + // calculations for unneeded components - bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the backend - bool readViewportArrayIndex; // Read viewport array index from last FE stage during binning + bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the + // backend + bool readViewportArrayIndex; // Read viewport array index from last FE stage during binning // User clip/cull distance enables uint8_t cullDistanceMask; @@ -1142,7 +1149,7 @@ struct SWR_BACKEND_STATE // and that the next fields are dword aligned. uint8_t pad[10]; - // Offset to the start of the attributes of the input vertices, in simdvector units + // Offset to the start of the attributes of the input vertices, in simdvector units uint32_t vertexAttribOffset; // Offset to clip/cull attrib section of the vertex, in simdvector units @@ -1151,7 +1158,7 @@ struct SWR_BACKEND_STATE SWR_ATTRIB_SWIZZLE swizzleMap[32]; }; static_assert(sizeof(SWR_BACKEND_STATE) == 128, - "Adjust padding to keep size (or remove this assert)"); + "Adjust padding to keep size (or remove this assert)"); union SWR_DEPTH_STENCIL_STATE @@ -1214,8 +1221,8 @@ enum SWR_PS_POSITION_OFFSET enum SWR_BARYCENTRICS_MASK { - SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1, - SWR_BARYCENTRIC_CENTROID_MASK = 0x2, + SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1, + SWR_BARYCENTRIC_CENTROID_MASK = 0x2, SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4, }; @@ -1223,27 +1230,28 @@ enum SWR_BARYCENTRICS_MASK struct SWR_PS_STATE { // dword 0-1 - PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn + PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn // dword 2 - uint32_t killsPixel : 1; // pixel shader can kill pixels - uint32_t inputCoverage : 2; // ps uses input coverage - uint32_t writesODepth : 1; // pixel shader writes to depth - uint32_t usesSourceDepth : 1; // pixel shader reads depth - uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel - uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position - uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with - uint32_t usesUAV : 1; // pixel shader accesses UAV - uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test + uint32_t killsPixel : 1; // pixel shader can kill pixels + uint32_t inputCoverage : 2; // ps uses input coverage + uint32_t writesODepth : 1; // pixel shader writes to depth + uint32_t usesSourceDepth : 1; // pixel shader reads depth + uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel + uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position + uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate + // attributes with + uint32_t usesUAV : 1; // pixel shader accesses UAV + uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test - uint8_t renderTargetMask; // Mask of render targets written + uint8_t renderTargetMask; // Mask of render targets written }; // depth bounds state struct SWR_DEPTH_BOUNDS_STATE { - bool depthBoundsTestEnable; - float depthBoundsTestMinValue; - float depthBoundsTestMaxValue; + bool depthBoundsTestEnable; + float depthBoundsTestMinValue; + float depthBoundsTestMaxValue; }; - +// clang-format on diff --git 
a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h index eaf0094b626..99eac835ea8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h +++ b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h @@ -1,36 +1,35 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file state.h -* -* @brief Definitions for API state - complex function implementation. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file state.h + * + * @brief Definitions for API state - complex function implementation. 
+ * + ******************************************************************************/ #pragma once #include "core/state.h" #include "common/simdintrin.h" - template <typename MaskT> INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max) { @@ -41,27 +40,27 @@ INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* ma INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples) { - for(int i = 0; i < numSamples; i++) + for (int i = 0; i < numSamples; i++) { _vXi[i] = _mm_set1_epi32(_xi[i]); _vYi[i] = _mm_set1_epi32(_yi[i]); - _vX[i] = _simd_set1_ps(_x[i]); - _vY[i] = _simd_set1_ps(_y[i]); + _vX[i] = _simd_set1_ps(_x[i]); + _vY[i] = _simd_set1_ps(_y[i]); } // precalculate the raster tile BB for the rasterizer. - CalcTileSampleOffsets(numSamples); + CalcTileSampleOffsets(numSamples); } INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples) { - auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); - auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); + auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); + auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); using xMask = std::integral_constant<int, 0xA>; // BR(max), BL(min), UR(max), UL(min) tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi); - auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); - auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); + auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); + auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); using yMask = std::integral_constant<int, 0xC>; // BR(max), BL(min), UR(max), UL(min) tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi); diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h index 316f66f94ae..348170bfd42 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tessellator.h +++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h @@ -1,43 +1,42 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file tessellator.h -* -* @brief Tessellator fixed function unit interface definition -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
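The xMask/yMask constants fed to expandThenBlend4() in CalcTileSampleOffsets above encode which raster-tile corner takes the min and which the max sample offset: reading the four lanes as UL, UR, BL, BR, a set mask bit selects the max. Note that the // comment on yMask in the hunk repeats the xMask corner list verbatim; 0xC itself selects the max for the two bottom lanes. A hypothetical AVX2 sketch of the resulting blends (expandThenBlend4's body is outside the quoted hunks, so the intrinsic choice here is an assumption):

    #include <immintrin.h>

    // Lanes are UL, UR, BL, BR; imm8 bit i set => lane i takes the max value.
    static inline void TileCornerOffsets(uint32_t minX, uint32_t maxX,
                                         uint32_t minY, uint32_t maxY,
                                         __m128i& offsetsX, __m128i& offsetsY)
    {
        // 0xA = 0b1010: max X on the right corners (UR, BR), min X on the left.
        offsetsX = _mm_blend_epi32(_mm_set1_epi32((int)minX), _mm_set1_epi32((int)maxX), 0xA);
        // 0xC = 0b1100: max Y on the bottom corners (BL, BR), min Y on the top.
        offsetsY = _mm_blend_epi32(_mm_set1_epi32((int)minY), _mm_set1_epi32((int)maxY), 0xC);
    }
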
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file tessellator.h + * + * @brief Tessellator fixed function unit interface definition + * + ******************************************************************************/ #pragma once /// Allocate and initialize a new tessellation context -HANDLE SWR_API TSInitCtx( - SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) - SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology - void* pContextMem, ///< [IN] Memory to use for the context - size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required +HANDLE SWR_API + TSInitCtx(SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) + SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology + void* pContextMem, ///< [IN] Memory to use for the context + size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. 
Out: Mem required /// Destroy & de-allocate tessellation context -void SWR_API TSDestroyCtx( - HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed +void SWR_API TSDestroyCtx(HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed struct SWR_TS_TESSELLATED_DATA { @@ -45,43 +44,38 @@ struct SWR_TS_TESSELLATED_DATA uint32_t NumDomainPoints; uint32_t* ppIndices[3]; - float* pDomainPointsU; - float* pDomainPointsV; + float* pDomainPointsU; + float* pDomainPointsV; // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] }; /// Perform Tessellation -void SWR_API TSTessellate( - HANDLE tsCtx, ///< [IN] Tessellation Context - const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors - SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data - +void SWR_API + TSTessellate(HANDLE tsCtx, ///< [IN] Tessellation Context + const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors + SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data /// @TODO - Implement OSS tessellator -INLINE HANDLE SWR_API TSInitCtx( - SWR_TS_DOMAIN tsDomain, - SWR_TS_PARTITIONING tsPartitioning, - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, - void* pContextMem, - size_t& memSize) +INLINE HANDLE SWR_API TSInitCtx(SWR_TS_DOMAIN tsDomain, + SWR_TS_PARTITIONING tsPartitioning, + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, + void* pContextMem, + size_t& memSize) { SWR_NOT_IMPL; return NULL; } - INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) { SWR_NOT_IMPL; } - -INLINE void SWR_API TSTessellate( - HANDLE tsCtx, - const SWR_TESSELLATION_FACTORS& tsTessFactors, - SWR_TS_TESSELLATED_DATA& tsTessellatedData) +INLINE void SWR_API TSTessellate(HANDLE tsCtx, + const SWR_TESSELLATION_FACTORS& tsTessFactors, + SWR_TS_TESSELLATED_DATA& tsTessellatedData) { SWR_NOT_IMPL; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index f77ae22a80a..4523616cba0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -1,25 +1,25 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -****************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ****************************************************************************/ #include <stdio.h> #include <thread> @@ -52,13 +52,11 @@ #include "tileset.h" - - // ThreadId struct Core { - uint32_t procGroup = 0; - std::vector<uint32_t> threadIds; + uint32_t procGroup = 0; + std::vector<uint32_t> threadIds; }; struct NumaNode @@ -78,7 +76,7 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread std::vector<KAFFINITY> threadMaskPerProcGroup; - static std::mutex m; + static std::mutex m; std::lock_guard<std::mutex> l(m); DWORD bufSize = 0; @@ -86,13 +84,14 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize); SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER); - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = + (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); SWR_ASSERT(pBufferMem); ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize); SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); - uint32_t count = bufSize / pBufferMem->Size; + uint32_t count = bufSize / pBufferMem->Size; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem; for (uint32_t i = 0; i < count; ++i) @@ -100,8 +99,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) { - auto& gmask = pBuffer->Processor.GroupMask[g]; - uint32_t threadId = 0; + auto& gmask = pBuffer->Processor.GroupMask[g]; + uint32_t threadId = 0; uint32_t procGroup = gmask.Group; Core* pCore = nullptr; @@ -133,10 +132,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId); // Find Numa Node - uint32_t numaId = 0; + uint32_t numaId = 0; PROCESSOR_NUMBER procNum = {}; - procNum.Group = WORD(procGroup); - procNum.Number = UCHAR(threadId); + procNum.Group = WORD(procGroup); + procNum.Number = UCHAR(threadId); ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); SWR_ASSERT(ret); @@ -146,7 +145,7 @@ void CalculateProcessorTopology(CPUNumaNodes& 
out_nodes, uint32_t& out_numThread { out_nodes.resize(numaId + 1); } - auto& numaNode = out_nodes[numaId]; + auto& numaNode = out_nodes[numaId]; numaNode.numaId = numaId; uint32_t coreId = 0; @@ -154,7 +153,7 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread if (nullptr == pCore) { numaNode.cores.push_back(Core()); - pCore = &numaNode.cores.back(); + pCore = &numaNode.cores.back(); pCore->procGroup = procGroup; } pCore->threadIds.push_back(threadId); @@ -169,56 +168,55 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread free(pBufferMem); - -#elif defined(__linux__) || defined (__gnu_linux__) +#elif defined(__linux__) || defined(__gnu_linux__) // Parse /proc/cpuinfo to get full topology std::ifstream input("/proc/cpuinfo"); - std::string line; - char* c; - uint32_t procId = uint32_t(-1); - uint32_t coreId = uint32_t(-1); - uint32_t physId = uint32_t(-1); + std::string line; + char* c; + uint32_t procId = uint32_t(-1); + uint32_t coreId = uint32_t(-1); + uint32_t physId = uint32_t(-1); while (std::getline(input, line)) { if (line.find("processor") != std::string::npos) { auto data_start = line.find(": ") + 2; - procId = std::strtoul(&line.c_str()[data_start], &c, 10); + procId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("core id") != std::string::npos) { auto data_start = line.find(": ") + 2; - coreId = std::strtoul(&line.c_str()[data_start], &c, 10); + coreId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("physical id") != std::string::npos) { auto data_start = line.find(": ") + 2; - physId = std::strtoul(&line.c_str()[data_start], &c, 10); + physId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.length() == 0) { if (physId + 1 > out_nodes.size()) out_nodes.resize(physId + 1); - auto& numaNode = out_nodes[physId]; + auto& numaNode = out_nodes[physId]; numaNode.numaId = physId; if (coreId + 1 > numaNode.cores.size()) numaNode.cores.resize(coreId + 1); - auto& core = numaNode.cores[coreId]; + auto& core = numaNode.cores[coreId]; core.procGroup = coreId; core.threadIds.push_back(procId); } } out_numThreadsPerProcGroup = 0; - for (auto &node : out_nodes) + for (auto& node : out_nodes) { - for (auto &core : node.cores) + for (auto& core : node.cores) { out_numThreadsPerProcGroup += core.threadIds.size(); } @@ -226,11 +224,11 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread #elif defined(__APPLE__) - auto numProcessors = 0; - auto numCores = 0; + auto numProcessors = 0; + auto numCores = 0; auto numPhysicalIds = 0; - int value; + int value; size_t size = sizeof(value); int result = sysctlbyname("hw.packages", &value, &size, NULL, 0); @@ -249,8 +247,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread for (auto physId = 0; physId < numPhysicalIds; ++physId) { - auto &numaNode = out_nodes[physId]; - auto procId = 0; + auto& numaNode = out_nodes[physId]; + auto procId = 0; numaNode.cores.resize(numCores); @@ -258,7 +256,7 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread { for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId) { - auto &core = numaNode.cores[coreId]; + auto& core = numaNode.cores[coreId]; core.procGroup = coreId; core.threadIds.push_back(procId); @@ -268,9 +266,9 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread out_numThreadsPerProcGroup = 0; - for (auto &node : out_nodes) + for 
(auto& node : out_nodes) { - for (auto &core : node.cores) + for (auto& core : node.cores) { out_numThreadsPerProcGroup += core.threadIds.size(); } @@ -283,10 +281,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread #endif // Prune empty cores and numa nodes - for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); ) + for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();) { // Erase empty cores (first) - for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); ) + for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();) { if (core_it->threadIds.size() == 0) { @@ -310,10 +308,14 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread } } -void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false) +void bindThread(SWR_CONTEXT* pContext, + uint32_t threadId, + uint32_t procGroupId = 0, + bool bindProcGroup = false) { // Only bind threads when MAX_WORKER_THREADS isn't set. - if (pContext->threadInfo.SINGLE_THREADED || (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)) + if (pContext->threadInfo.SINGLE_THREADED || + (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)) { return; } @@ -321,7 +323,7 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = #if defined(_WIN32) GROUP_AFFINITY affinity = {}; - affinity.Group = procGroupId; + affinity.Group = procGroupId; #if !defined(_WIN64) if (threadId >= 32) @@ -340,7 +342,7 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = { // If MAX_WORKER_THREADS is set, only bind to the proc group, // Not the individual HW thread. - if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS) + if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS) { affinity.Mask = KAFFINITY(1) << threadId; } @@ -372,15 +374,15 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = } INLINE -uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext) +uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext) { return pContext->dcRing.GetHead(); } INLINE -DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId) +DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId) { - return &pContext->dcRing[(drawId-1) % pContext->MAX_DRAWS_IN_FLIGHT]; + return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT]; } INLINE @@ -393,12 +395,12 @@ bool IDComparesLess(uint32_t a, uint32_t b) // returns true if dependency not met INLINE -bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw) +bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw) { return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1); } -bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw) +bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw) { return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1); } @@ -413,15 +415,15 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CON } DRAW_DYNAMIC_STATE& dynState = pDC->dynState; - OSALIGNLINE(SWR_STATS) stats{ 0 }; + OSALIGNLINE(SWR_STATS) stats{0}; // Sum up stats across all workers before sending to client. 
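// A minimal sketch of why the summation below needs no locks (names here are
// illustrative, not the driver's): each worker only ever writes its own
// SWR_STATS slot, so a plain single-threaded reduction is safe once the draw
// has retired.
//
//     struct WorkerStats { uint64_t depthPass = 0; };
//     WorkerStats perWorker[kMaxWorkers]; // one padded slot per worker
//     WorkerStats total;
//     for (uint32_t w = 0; w < numWorkers; ++w)
//         total.depthPass += perWorker[w].depthPass;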
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { stats.DepthPassCount += dynState.pStats[i].DepthPassCount; - stats.PsInvocations += dynState.pStats[i].PsInvocations; - stats.CsInvocations += dynState.pStats[i].CsInvocations; + stats.PsInvocations += dynState.pStats[i].PsInvocations; + stats.CsInvocations += dynState.pStats[i].CsInvocations; } @@ -435,8 +437,8 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT if (pDC->retireCallback.pfnCallbackFunc) { pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData, - pDC->retireCallback.userData2, - pDC->retireCallback.userData3); + pDC->retireCallback.userData2, + pDC->retireCallback.userData3); } } @@ -465,7 +467,7 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, _ReadWriteBarrier(); - pContext->dcRing.Dequeue(); // Remove from tail + pContext->dcRing.Dequeue(); // Remove from tail } return result; @@ -477,20 +479,23 @@ int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) return CompleteDrawContextInl(pContext, 0, pDC); } -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued) +INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, + uint32_t workerId, + uint32_t& curDrawBE, + uint32_t& drawEnqueued) { // increment our current draw id to the first incomplete draw drawEnqueued = GetEnqueuedDraw(pContext); while (IDComparesLess(curDrawBE, drawEnqueued)) { - DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT]; + DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT]; // If it's not compute and FE is not done then break out of loop. - if (!pDC->doneFE && !pDC->isCompute) break; + if (!pDC->doneFE && !pDC->isCompute) + break; - bool isWorkComplete = pDC->isCompute ? - pDC->pDispatch->isWorkComplete() : - pDC->pTileMgr->isWorkComplete(); + bool isWorkComplete = + pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); if (isWorkComplete) { @@ -511,24 +516,24 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, ui /// @brief If there is any BE work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. -/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -/// has its own curDrawBE counter and this ensures that each worker processes all the -/// draws in order. +/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker +/// thread +/// has its own curDrawBE counter and this ensures that each worker processes all +/// the draws in order. /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its -/// own set and each time it fails to lock a macrotile, because its already locked, -/// then it will add that tile to the lockedTiles set. As a worker begins to work -/// on future draws the lockedTiles ensure that it doesn't work on tiles that may -/// still have work pending in a previous draw. Additionally, the lockedTiles is -/// hueristic that can steer a worker back to the same macrotile that it had been -/// working on in a previous draw. +/// own set and each time it fails to lock a macrotile, because it's already +/// locked, then it will add that tile to the lockedTiles set.
As a worker +/// begins to work on future draws the lockedTiles set ensures that it doesn't work +/// on tiles that may still have work pending in a previous draw. Additionally, +/// the lockedTiles set is a heuristic that can steer a worker back to the same +/// macrotile that it had been working on in a previous draw. /// @returns true if worker thread should shutdown -bool WorkOnFifoBE( - SWR_CONTEXT *pContext, - uint32_t workerId, - uint32_t &curDrawBE, - TileSet& lockedTiles, - uint32_t numaNode, - uint32_t numaMask) +bool WorkOnFifoBE(SWR_CONTEXT* pContext, + uint32_t workerId, + uint32_t& curDrawBE, + TileSet& lockedTiles, + uint32_t numaNode, + uint32_t numaMask) { bool bShutdown = false; @@ -540,27 +545,30 @@ bool WorkOnFifoBE( return false; } - uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; + uint32_t lastRetiredDraw = + pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. lockedTiles.clear(); // Try to work on each draw in order of the available draws in flight. // 1. If we're on curDrawBE, we can work on any macrotile that is available. - // 2. If we're trying to work on draws after curDrawBE, we are restricted to + // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provide the history to ensure this. for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { - DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; + DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute) return false; // We don't look at compute work. + if (pDC->isCompute) + return false; // We don't look at compute work. // First wait for FE to be finished with this draw. This keeps threading model simple // but if there are lots of bubbles between draws then serializing FE and BE may // need to be revisited. - if (!pDC->doneFE) return false; - + if (!pDC->doneFE) + return false; + // If this draw is dependent on a previous draw then we need to bail. if (CheckDependency(pContext, pDC, lastRetiredDraw)) { @@ -568,7 +576,7 @@ bool WorkOnFifoBE( } // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. - auto &macroTiles = pDC->pTileMgr->getDirtyTiles(); + auto& macroTiles = pDC->pTileMgr->getDirtyTiles(); for (auto tile : macroTiles) { @@ -595,7 +603,7 @@ bool WorkOnFifoBE( if (tile->tryLock()) { - BE_WORK *pWork; + BE_WORK* pWork; RDTSC_BEGIN(WorkerFoundWork, pDC->drawId); @@ -624,11 +632,13 @@ bool WorkOnFifoBE( pDC->pTileMgr->markTileComplete(tileID); - // Optimization: If the draw is complete and we're the last one to have worked on it then - // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. + // Optimization: If the draw is complete and we're the last one to have worked on it + // then we can reset the locked list as we know that all previous draws before the + // next are guaranteed to be complete. if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete())) { - // We can increment the current BE and safely move to next draw since we know this draw is complete. + // We can increment the current BE and safely move to next draw since we know + // this draw is complete.
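// The "safely move to next draw" bookkeeping above (and IDComparesLess
// throughout this file) depends on draw ids ordering correctly across
// uint32_t wraparound. A sketch of that comparison, illustrative only:
//
//     inline bool IdLess(uint32_t a, uint32_t b)
//     {
//         return int32_t(a - b) < 0; // signed distance survives wraparound
//     }
//     // e.g. IdLess(0xFFFFFFFFu, 1u) is true even though 0xFFFFFFFFu > 1u.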
curDrawBE++; CompleteDrawContextInl(pContext, workerId, pDC); @@ -645,7 +655,8 @@ bool WorkOnFifoBE( } else { - // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. + // This tile is already locked. So let's add it to our locked tiles set. This way we + // don't try locking this one again. lockedTiles.set(tileID); } } @@ -663,12 +674,24 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEX SWR_STATS_FE& stats = pDC->dynState.statsFE; AR_EVENT(FrontendStatsEvent(pDC->drawId, - stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations, - stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives, - stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3], - stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3] - )); - AR_EVENT(FrontendDrawEndEvent(pDC->drawId)); + stats.IaVertices, + stats.IaPrimitives, + stats.VsInvocations, + stats.HsInvocations, + stats.DsInvocations, + stats.GsInvocations, + stats.GsPrimitives, + stats.CInvocations, + stats.CPrimitives, + stats.SoPrimStorageNeeded[0], + stats.SoPrimStorageNeeded[1], + stats.SoPrimStorageNeeded[2], + stats.SoPrimStorageNeeded[3], + stats.SoNumPrimsWritten[0], + stats.SoNumPrimsWritten[1], + stats.SoNumPrimsWritten[2], + stats.SoNumPrimsWritten[3])); + AR_EVENT(FrontendDrawEndEvent(pDC->drawId)); pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats); } @@ -680,7 +703,8 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEX if ((pDC->dynState.SoWriteOffsetDirty[i]) && (pDC->pState->state.soBuffer[i].soWriteEnable)) { - pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]); + pContext->pfnUpdateSoWriteOffset( + GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]); } } } @@ -692,14 +716,14 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEX InterlockedDecrement(&pContext->drawsOutstandingFE); } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE) +void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE) { // Try to grab the next DC from the ring uint32_t drawEnqueued = GetEnqueuedDraw(pContext); while (IDComparesLess(curDrawFE, drawEnqueued)) { - uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT; - DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; + uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT; + DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot]; if (pDC->isCompute || pDC->doneFE) { CompleteDrawContextInl(pContext, workerId, pDC); @@ -712,11 +736,11 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE) } uint32_t lastRetiredFE = curDrawFE - 1; - uint32_t curDraw = curDrawFE; + uint32_t curDraw = curDrawFE; while (IDComparesLess(curDraw, drawEnqueued)) { - uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT; - DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; + uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT; + DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot]; if (!pDC->isCompute && !pDC->FeLock) { @@ -742,13 +766,11 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE) /// @brief If there is any compute work then go work on it. /// @param pContext - pointer to SWR context. 
/// @param workerId - The unique worker ID that is assigned to this thread. -/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -/// has its own curDrawBE counter and this ensures that each worker processes all the -/// draws in order. -void WorkOnCompute( - SWR_CONTEXT *pContext, - uint32_t workerId, - uint32_t& curDrawBE) +/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker +/// thread +/// has its own curDrawBE counter and this ensures that each worker processes all +/// the draws in order. +void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE) { uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false) @@ -756,12 +778,14 @@ void WorkOnCompute( return; } - uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; + uint32_t lastRetiredDraw = + pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { - DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute == false) return; + DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; + if (pDC->isCompute == false) + return; // check dependencies if (CheckDependency(pContext, pDC, lastRetiredDraw)) @@ -775,9 +799,9 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { - void* pSpillFillBuffer = nullptr; - void* pScratchSpace = nullptr; - uint32_t threadGroupId = 0; + void* pSpillFillBuffer = nullptr; + void* pScratchSpace = nullptr; + uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); @@ -790,7 +814,7 @@ void WorkOnCompute( } } -void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId) +void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId) { if (nullptr == pContext) { @@ -801,25 +825,26 @@ void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId) { if (pContext->threadPool.numReservedThreads) { - const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0]; + const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0]; // Just bind to the process group used for API thread 0 bindThread(pContext, 0, threadData.procGroupId, true); } return; } - const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId]; + const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId]; - bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup); + bindThread( + pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup); } -template<bool IsFEThread, bool IsBEThread> +template <bool IsFEThread, bool IsBEThread> DWORD workerThreadMain(LPVOID pData) { - THREAD_DATA *pThreadData = (THREAD_DATA*)pData; - SWR_CONTEXT *pContext = pThreadData->pContext; - uint32_t threadId = pThreadData->threadId; - uint32_t workerId = pThreadData->workerId; + THREAD_DATA* pThreadData = (THREAD_DATA*)pData; + SWR_CONTEXT* pContext = pThreadData->pContext; + uint32_t threadId = pThreadData->threadId; + uint32_t workerId = pThreadData->workerId; bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); @@ -832,7 +857,10 @@ DWORD workerThreadMain(LPVOID pData) // linux pthread name limited to 16 chars (including \0) 
"w%03d-n%d-c%03d-t%d", #endif - workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId); + workerId, + pThreadData->numaId, + pThreadData->coreId, + pThreadData->htId); SetCurrentThreadName(threadName); } @@ -851,7 +879,7 @@ DWORD workerThreadMain(LPVOID pData) // each worker has the ability to work on any of the queued draws as long as certain // conditions are met. the data associated - // with a draw is guaranteed to be active as long as a worker hasn't signaled that he + // with a draw is guaranteed to be active as long as a worker hasn't signaled that he // has moved on to the next draw when he determines there is no more work to do. The api // thread will not increment the head of the dc ring until all workers have moved past the // current head. @@ -906,7 +934,8 @@ DWORD workerThreadMain(LPVOID pData) if (IsBEThread) { RDTSC_BEGIN(WorkerWorkOnFifoBE, 0); - bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); + bShutdown |= + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); RDTSC_END(WorkerWorkOnFifoBE, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -925,7 +954,8 @@ DWORD workerThreadMain(LPVOID pData) return 0; } -template<> DWORD workerThreadMain<false, false>(LPVOID) = delete; +template <> +DWORD workerThreadMain<false, false>(LPVOID) = delete; template <bool IsFEThread, bool IsBEThread> DWORD workerThreadInit(LPVOID pData) @@ -938,7 +968,7 @@ DWORD workerThreadInit(LPVOID pData) } #if defined(_WIN32) - __except(EXCEPTION_CONTINUE_SEARCH) + __except (EXCEPTION_CONTINUE_SEARCH) { } @@ -946,14 +976,16 @@ DWORD workerThreadInit(LPVOID pData) return 1; } -template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete; +template <> +DWORD workerThreadInit<false, false>(LPVOID pData) = delete; static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads) { // Initialize DRAW_CONTEXT's per-thread stats for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) { - pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64); + pContext->dcRing[dc].dynState.pStats = + (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64); memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); } } @@ -965,15 +997,15 @@ static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads) void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) { CPUNumaNodes nodes; - uint32_t numThreadsPerProcGroup = 0; + uint32_t numThreadsPerProcGroup = 0; CalculateProcessorTopology(nodes, numThreadsPerProcGroup); // Assumption, for asymmetric topologies, multi-threaded cores will appear // in the list before single-threaded cores. This appears to be true for // Windows when the total HW threads is limited to 64. 
- uint32_t numHWNodes = (uint32_t)nodes.size(); - uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); - uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); + uint32_t numHWNodes = (uint32_t)nodes.size(); + uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); + uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); #if defined(_WIN32) && !defined(_WIN64) if (!pContext->threadInfo.MAX_WORKER_THREADS) @@ -997,9 +1029,9 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) } } - uint32_t numNodes = numHWNodes; - uint32_t numCoresPerNode = numHWCoresPerNode; - uint32_t numHyperThreads = numHWHyperThreads; + uint32_t numNodes = numHWNodes; + uint32_t numCoresPerNode = numHWCoresPerNode; + uint32_t numHyperThreads = numHWHyperThreads; // Calc used threads per-core if (numHyperThreads > pContext->threadInfo.BASE_THREAD) @@ -1008,11 +1040,10 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) } else { - SWR_ASSERT( - false, - "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0", - pContext->threadInfo.BASE_THREAD, - numHyperThreads); + SWR_ASSERT(false, + "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0", + pContext->threadInfo.BASE_THREAD, + numHyperThreads); pContext->threadInfo.BASE_THREAD = 0; } @@ -1042,11 +1073,10 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) } else { - SWR_ASSERT( - false, - "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0", - pContext->threadInfo.BASE_CORE, - numCoresPerNode); + SWR_ASSERT(false, + "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0", + pContext->threadInfo.BASE_CORE, + numCoresPerNode); pContext->threadInfo.BASE_CORE = 0; } @@ -1080,25 +1110,25 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) SWR_REL_ASSERT(numThreads <= numHWThreads); uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads; - uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore; - uint32_t numRemovedThreads = 0; + uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore; + uint32_t numRemovedThreads = 0; if (pContext->threadInfo.SINGLE_THREADED) { - numAPIReservedThreads = 0; - numThreads = 1; + numAPIReservedThreads = 0; + numThreads = 1; pContext->NumWorkerThreads = 1; - pContext->NumFEThreads = 1; - pContext->NumBEThreads = 1; - pPool->numThreads = 0; + pContext->NumFEThreads = 1; + pContext->NumBEThreads = 1; + pPool->numThreads = 0; } else if (pContext->threadInfo.MAX_WORKER_THREADS) { numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads); pContext->threadInfo.BASE_NUMA_NODE = 0; - pContext->threadInfo.BASE_CORE = 0; - pContext->threadInfo.BASE_THREAD = 0; - numAPIReservedThreads = 0; + pContext->threadInfo.BASE_CORE = 0; + pContext->threadInfo.BASE_THREAD = 0; + numAPIReservedThreads = 0; } else { @@ -1119,7 +1149,8 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) if (numAPIThreadsPerCore == 2 && numHyperThreads == 1) { // Adjust removed threads to make logic below work - numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2); + numRemovedThreads = + std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2); } numThreads -= numRemovedThreads; @@ -1131,7 +1162,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) if (pContext->threadInfo.SINGLE_THREADED) { numAPIReservedThreads = 0; - numThreads 
= 1; + numThreads = 1; } if (numAPIReservedThreads) @@ -1149,7 +1180,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) } pPool->numReservedThreads = numAPIReservedThreads; - pPool->numThreads = numThreads; + pPool->numThreads = numThreads; pContext->NumWorkerThreads = pPool->numThreads; pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads]; @@ -1161,7 +1192,8 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) pPool->pWorkerPrivateDataArray = nullptr; if (pContext->workerPrivateState.perWorkerPrivateStateSize) { - size_t perWorkerSize = AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); + size_t perWorkerSize = + AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); size_t totalSize = perWorkerSize * pPool->numThreads; if (totalSize) { @@ -1191,19 +1223,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) if (pContext->threadInfo.MAX_WORKER_THREADS) { - bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); + bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup; // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads // But Windows will still require binding to specific process groups for (uint32_t workerId = 0; workerId < numThreads; ++workerId) { - pPool->pThreadData[workerId].workerId = workerId; - pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; - pPool->pThreadData[workerId].threadId = 0; - pPool->pThreadData[workerId].numaId = 0; - pPool->pThreadData[workerId].coreId = 0; - pPool->pThreadData[workerId].htId = 0; - pPool->pThreadData[workerId].pContext = pContext; + pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; + pPool->pThreadData[workerId].threadId = 0; + pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].coreId = 0; + pPool->pThreadData[workerId].htId = 0; + pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; pContext->NumBEThreads++; @@ -1228,7 +1260,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) pPool->numaMask = 0; } - uint32_t workerId = 0; + uint32_t workerId = 0; uint32_t numReservedThreads = numAPIReservedThreads; for (uint32_t n = 0; n < numNodes; ++n) { @@ -1236,7 +1268,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) { break; } - auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE]; + auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE]; uint32_t numCores = numCoresPerNode; for (uint32_t c = 0; c < numCores; ++c) { @@ -1258,26 +1290,32 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) --numRemovedThreads; SWR_REL_ASSERT(numReservedThreads); --numReservedThreads; - pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; + pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; - pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t]; - pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? 
(n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE; - pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD; - pPool->pApiThreadData[numReservedThreads].pContext = pContext; + pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t]; + pPool->pApiThreadData[numReservedThreads].numaId = + useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; + pPool->pApiThreadData[numReservedThreads].coreId = + c + pContext->threadInfo.BASE_CORE; + pPool->pApiThreadData[numReservedThreads].htId = + t + pContext->threadInfo.BASE_THREAD; + pPool->pApiThreadData[numReservedThreads].pContext = pContext; pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; - if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads) { --numReservedThreads; - pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; + pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; - pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1]; - pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE; - pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD; - pPool->pApiThreadData[numReservedThreads].pContext = pContext; + pPool->pApiThreadData[numReservedThreads].threadId = + core.threadIds[t + 1]; + pPool->pApiThreadData[numReservedThreads].numaId = + useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; + pPool->pApiThreadData[numReservedThreads].coreId = + c + pContext->threadInfo.BASE_CORE; + pPool->pApiThreadData[numReservedThreads].htId = + t + pContext->threadInfo.BASE_THREAD; + pPool->pApiThreadData[numReservedThreads].pContext = pContext; pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; } @@ -1286,12 +1324,14 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) SWR_ASSERT(workerId < numThreads); - pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].workerId = workerId; pPool->pThreadData[workerId].procGroupId = core.procGroup; - pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD]; - pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE; - pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD; + pPool->pThreadData[workerId].threadId = + core.threadIds[t + pContext->threadInfo.BASE_THREAD]; + pPool->pThreadData[workerId].numaId = + useNuma ? 
(n + pContext->threadInfo.BASE_NUMA_NODE) : 0; + pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE; + pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD; pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = false; @@ -1319,7 +1359,8 @@ void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId) { - pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); + pPool->pThreads[workerId] = + new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); } } @@ -1327,7 +1368,7 @@ void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) /// @brief Destroys thread pool. /// @param pContext - pointer to context /// @param pPool - pointer to thread pool object. -void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) +void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) { // Wait for all threads to finish SwrWaitForIdle(pContext); @@ -1340,12 +1381,13 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) // Detach from thread. Cannot join() due to possibility (in Windows) of code // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns. pPool->pThreads[t]->detach(); - delete(pPool->pThreads[t]); + delete (pPool->pThreads[t]); } if (pContext->workerPrivateState.pfnFinishWorkerData) { - pContext->workerPrivateState.pfnFinishWorkerData(pPool->pThreadData[t].pWorkerPrivateData, t); + pContext->workerPrivateState.pfnFinishWorkerData( + pPool->pThreadData[t].pWorkerPrivateData, t); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 0489a3cc6cf..d0f4b30dca0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file threads.h -* -* @brief Definitions for SWR threading model. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file threads.h + * + * @brief Definitions for SWR threading model. + * + ******************************************************************************/ #pragma once #include "knobs.h" @@ -39,39 +39,43 @@ struct SWR_WORKER_PRIVATE_STATE; struct THREAD_DATA { - void* pWorkerPrivateData;// Pointer to per-worker private data - uint32_t procGroupId; // Will always be 0 for non-Windows OS - uint32_t threadId; // within the procGroup for Windows - uint32_t numaId; // NUMA node id - uint32_t coreId; // Core id - uint32_t htId; // Hyperthread id - uint32_t workerId; - SWR_CONTEXT *pContext; - bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set. + void* pWorkerPrivateData; // Pointer to per-worker private data + uint32_t procGroupId; // Will always be 0 for non-Windows OS + uint32_t threadId; // within the procGroup for Windows + uint32_t numaId; // NUMA node id + uint32_t coreId; // Core id + uint32_t htId; // Hyperthread id + uint32_t workerId; + SWR_CONTEXT* pContext; + bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set. 
}; - struct THREAD_POOL { - THREAD_PTR* pThreads; - uint32_t numThreads; - uint32_t numaMask; - THREAD_DATA *pThreadData; - void* pWorkerPrivateDataArray; // All memory for worker private data - uint32_t numReservedThreads; // Number of threads reserved for API use - THREAD_DATA *pApiThreadData; + THREAD_PTR* pThreads; + uint32_t numThreads; + uint32_t numaMask; + THREAD_DATA* pThreadData; + void* pWorkerPrivateDataArray; // All memory for worker private data + uint32_t numReservedThreads; // Number of threads reserved for API use + THREAD_DATA* pApiThreadData; }; struct TileSet; -void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); +void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); -void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); +void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE); -bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); -void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE); +void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE); +bool WorkOnFifoBE(SWR_CONTEXT* pContext, + uint32_t workerId, + uint32_t& curDrawBE, + TileSet& usedTiles, + uint32_t numaNode, + uint32_t numaMask); +void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE); int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); -void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId); +void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId); diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 1bdef4bd7dd..87d5373a215 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file tilemgr.cpp -* -* @brief Implementation for Macro Tile Manager which provides the facilities -* for threads to work on an macro tile. 
-* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file tilemgr.cpp + * + * @brief Implementation for Macro Tile Manager which provides the facilities + * for threads to work on a macro tile. + * + ******************************************************************************/ #include <unordered_map> #include "fifo.hpp" @@ -33,17 +33,15 @@ #include "core/multisample.h" #include "rdtsc_core.h" -MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) -{ -} +MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) {} -void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) +void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork) { // Should not enqueue more than what we have backing for in the hot tile manager.
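// For illustration only, a hypothetical id packing consistent with the x/y
// bounds checks that follow (the real mapping lives in
// MacroTileMgr::getTileIndices and may differ):
//
//     uint32_t PackTileID(uint32_t x, uint32_t y) { return (y << 16) | x; }
//     void UnpackTileID(uint32_t id, uint32_t& x, uint32_t& y)
//     {
//         x = id & 0xFFFF;
//         y = id >> 16;
//     }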
SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1))) + if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1))) { return; } @@ -55,7 +53,7 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) mTiles.resize((16 + id) * 2); } - MacroTileQueue *pTile = mTiles[id]; + MacroTileQueue* pTile = mTiles[id]; if (!pTile) { pTile = mTiles[id] = new MacroTileQueue(); @@ -76,8 +74,8 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) void MacroTileMgr::markTileComplete(uint32_t id) { SWR_ASSERT(mTiles.size() > id); - MacroTileQueue &tile = *mTiles[id]; - uint32_t numTiles = tile.mWorkItemsFE; + MacroTileQueue& tile = *mTiles[id]; + uint32_t numTiles = tile.mWorkItemsFE; InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); _ReadWriteBarrier(); @@ -90,8 +88,14 @@ void MacroTileMgr::markTileComplete(uint32_t id) tile.mWorkItemsBE = 0; } -HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, - uint32_t renderTargetArrayIndex) +HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + HANDLE hWorkerPrivateData, + uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, + bool create, + uint32_t numSamples, + uint32_t renderTargetArrayIndex) { uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); @@ -99,17 +103,18 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - HotTileSet &tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; + HotTileSet& tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; if (hotTile.pBuffer == NULL) { if (create) { - uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t size = numSamples * mHotTileSize[attachment]; uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); - hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; + hotTile.pBuffer = + (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; hotTile.renderTargetArrayIndex = renderTargetArrayIndex; } else @@ -122,22 +127,22 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE // free the old tile and create a new one with enough space to hold all samples if (numSamples > hotTile.numSamples) { - // tile should be either uninitialized or resolved if we're deleting and switching to a + // tile should be either uninitialized or resolved if we're deleting and switching to a // new sample count - SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || - (hotTile.state == HOTTILE_RESOLVED) || - (hotTile.state == HOTTILE_CLEAR)); + SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); FreeHotTileMem(hotTile.pBuffer); - uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t size = numSamples * mHotTileSize[attachment]; uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); - hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); - hotTile.state = HOTTILE_INVALID; + hotTile.pBuffer = 
+ (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); + hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; } - // if requested render target array index isn't currently loaded, need to store out the current hottile - // and load the requested array slice + // if requested render target array index isn't currently loaded, need to store out the + // current hottile and load the requested array slice if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) { SWR_FORMAT format; @@ -150,10 +155,19 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE case SWR_ATTACHMENT_COLOR4: case SWR_ATTACHMENT_COLOR5: case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; - default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_COLOR7: + format = KNOB_COLOR_HOT_TILE_FORMAT; + break; + case SWR_ATTACHMENT_DEPTH: + format = KNOB_DEPTH_HOT_TILE_FORMAT; + break; + case SWR_ATTACHMENT_STENCIL: + format = KNOB_STENCIL_HOT_TILE_FORMAT; + break; + default: + SWR_INVALID("Unknown attachment: %d", attachment); + format = KNOB_COLOR_HOT_TILE_FORMAT; + break; } if (hotTile.state == HOTTILE_CLEAR) @@ -170,23 +184,38 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE if (hotTile.state == HOTTILE_DIRTY) { - pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + pContext->pfnStoreTile(GetPrivateState(pDC), + hWorkerPrivateData, + format, + attachment, + x * KNOB_MACROTILE_X_DIM, + y * KNOB_MACROTILE_Y_DIM, + hotTile.renderTargetArrayIndex, + hotTile.pBuffer); } - pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + pContext->pfnLoadTile(GetPrivateState(pDC), + hWorkerPrivateData, + format, + attachment, + x * KNOB_MACROTILE_X_DIM, + y * KNOB_MACROTILE_Y_DIM, + renderTargetArrayIndex, + hotTile.pBuffer); hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - hotTile.state = HOTTILE_DIRTY; + hotTile.state = HOTTILE_DIRTY; } } return &tile.Attachment[attachment]; } -HOTTILE* HotTileMgr::GetHotTileNoLoad( - SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, - SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) +HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, + bool create, + uint32_t numSamples) { uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); @@ -194,16 +223,16 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad( SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - HotTileSet &tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; + HotTileSet& tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; if (hotTile.pBuffer == NULL) { if (create) { - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = 
(uint8_t*)AlignedMalloc(size, 64); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; hotTile.renderTargetArrayIndex = 0; } else @@ -216,23 +245,25 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad( } #if USE_8x2_TILE_BACKEND -void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +void HotTileMgr::ClearColorHotTile( + const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. { // Load clear color into SIMD register... - float *pClearData = (float *)(pHotTile->clearData); - simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]); - simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]); - simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]); - simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]); + float* pClearData = (float*)(pHotTile->clearData); + simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]); + simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]); + simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]); + simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]); - float *pfBuf = (float *)pHotTile->pBuffer; + float* pfBuf = (float*)pHotTile->pBuffer; uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) { _simd16_store_ps(pfBuf, valR); pfBuf += KNOB_SIMD16_WIDTH; @@ -250,20 +281,22 @@ void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro ti } } -void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +void HotTileMgr::ClearDepthHotTile( + const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. { // Load clear color into SIMD register... - float *pClearData = (float *)(pHotTile->clearData); - simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]); + float* pClearData = (float*)(pHotTile->clearData); + simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]); - float *pfBuf = (float *)pHotTile->pBuffer; + float* pfBuf = (float*)pHotTile->pBuffer; uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) { _simd16_store_ps(pfBuf, valZ); pfBuf += KNOB_SIMD16_WIDTH; @@ -276,18 +309,19 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) { // convert from F32 to U8. uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - //broadcast 32x into __m256i... + // broadcast 32x into __m256i... 
simd16scalari valS = _simd16_set1_epi8(clearVal); - simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; + simd16scalari* pBuf = (simd16scalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4) { _simd16_store_si(pBuf, valS); pBuf += 1; @@ -297,23 +331,26 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) } #else -void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +void HotTileMgr::ClearColorHotTile( + const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. { // Load clear color into SIMD register... - float *pClearData = (float*)(pHotTile->clearData); - simdscalar valR = _simd_broadcast_ss(&pClearData[0]); - simdscalar valG = _simd_broadcast_ss(&pClearData[1]); - simdscalar valB = _simd_broadcast_ss(&pClearData[2]); - simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + float* pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); - float *pfBuf = (float*)pHotTile->pBuffer; + float* pfBuf = (float*)pHotTile->pBuffer; uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += + SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) { _simd_store_ps(pfBuf, valR); pfBuf += KNOB_SIMD_WIDTH; @@ -328,20 +365,22 @@ void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro ti } } -void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +void HotTileMgr::ClearDepthHotTile( + const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. { // Load clear color into SIMD register... 
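// Every Clear*HotTile in this file follows the same broadcast-and-stream
// shape. A compact stand-alone sketch with raw AVX intrinsics (illustrative;
// the driver goes through its simdscalar/_simd_* wrappers instead):
//
//     #include <immintrin.h>
//     void FillF32(float* dst, float value, size_t count) // count % 8 == 0
//     {
//         __m256 v = _mm256_set1_ps(value); // broadcast scalar 8 lanes wide
//         for (size_t i = 0; i < count; i += 8)
//             _mm256_store_ps(dst + i, v);  // dst must be 32-byte aligned
//     }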
- float *pClearData = (float*)(pHotTile->clearData); - simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + float* pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); - float *pfBuf = (float*)pHotTile->pBuffer; + float* pfBuf = (float*)pHotTile->pBuffer; uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) { _simd_store_ps(pfBuf, valZ); pfBuf += KNOB_SIMD_WIDTH; @@ -354,18 +393,19 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) { // convert from F32 to U8. uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - //broadcast 32x into __m256i... + // broadcast 32x into __m256i... simdscalari valS = _simd_set1_epi8(clearVal); - simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) { for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) { // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); + si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) { _simd_store_si(pBuf, valS); pBuf += 1; @@ -383,9 +423,12 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) /// to avoid unnecessary setup every triangle /// @todo support deferred clear /// @param pCreateInfo - pointer to creation info. 
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID) +void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroID) { - const API_STATE& state = GetApiState(pDC); + const API_STATE& state = GetApiState(pDC); HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; uint32_t x, y; @@ -396,17 +439,31 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); // check RT if enabled - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; while (_BitScanForward(&rtSlot, colorHottileEnableMask)) { - HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + HOTTILE* pHotTile = + GetHotTile(pContext, + pDC, + hWorkerPrivateData, + macroID, + (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), + true, + numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pContext->pfnLoadTile(GetPrivateState(pDC), + hWorkerPrivateData, + KNOB_COLOR_HOT_TILE_FORMAT, + (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), + x, + y, + pHotTile->renderTargetArrayIndex, + pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } @@ -424,12 +481,20 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui // check depth if enabled if (state.depthHottileEnable) { - HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + HOTTILE* pHotTile = GetHotTile( + pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pContext->pfnLoadTile(GetPrivateState(pDC), + hWorkerPrivateData, + KNOB_DEPTH_HOT_TILE_FORMAT, + SWR_ATTACHMENT_DEPTH, + x, + y, + pHotTile->renderTargetArrayIndex, + pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } @@ -446,12 +511,20 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui // check stencil if enabled if (state.stencilHottileEnable) { - HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + HOTTILE* pHotTile = GetHotTile( + pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_STENCIL_HOT_TILE_FORMAT, 
SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pContext->pfnLoadTile(GetPrivateState(pDC), + hWorkerPrivateData, + KNOB_STENCIL_HOT_TILE_FORMAT, + SWR_ATTACHMENT_STENCIL, + x, + y, + pHotTile->renderTargetArrayIndex, + pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 8392db1b05f..7173b0248f1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -1,31 +1,31 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file tilemgr.h -* -* @brief Definitions for Macro Tile Manager which provides the facilities -* for threads to work on an macro tile. -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file tilemgr.h + * + * @brief Definitions for Macro Tile Manager which provides the facilities + * for threads to work on an macro tile. 
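One note before the tilemgr.h hunks continue: InitializeHotTiles above walks the enabled render targets by repeatedly scanning colorHottileEnableMask with _BitScanForward and loading any hot tile still in HOTTILE_INVALID state. A portable sketch of that iterate-set-bits idiom (the visitor is hypothetical, and __builtin_ctz is the GCC/Clang counterpart of the MSVC intrinsic used here):

#include <stdint.h>
#include <stdio.h>

// Visit each enabled render-target slot, lowest bit first.
static void ForEachSetBit(uint32_t mask, void (*visit)(uint32_t slot))
{
    while (mask)
    {
        uint32_t slot = (uint32_t)__builtin_ctz(mask); // index of lowest set bit
        visit(slot);
        mask &= mask - 1;                              // clear the lowest set bit
    }
}

int main()
{
    ForEachSetBit(0xBu, [](uint32_t s) { printf("RT slot %u enabled\n", s); });
    return 0;
}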
+ * + ******************************************************************************/ #pragma once #include <set> @@ -41,22 +41,16 @@ ////////////////////////////////////////////////////////////////////////// struct MacroTileQueue { - MacroTileQueue() { } + MacroTileQueue() {} ~MacroTileQueue() { destroy(); } ////////////////////////////////////////////////////////////////////////// /// @brief Returns number of work items queued for this tile. - uint32_t getNumQueued() - { - return mFifo.getNumQueued(); - } + uint32_t getNumQueued() { return mFifo.getNumQueued(); } ////////////////////////////////////////////////////////////////////////// /// @brief Attempt to lock the work fifo. If already locked then return false. - bool tryLock() - { - return mFifo.tryLock(); - } + bool tryLock() { return mFifo.tryLock(); } ////////////////////////////////////////////////////////////////////////// /// @brief Clear fifo and unlock it. @@ -68,10 +62,7 @@ struct MacroTileQueue ////////////////////////////////////////////////////////////////////////// /// @brief Peek at work sitting at the front of the fifo. - BE_WORK* peek() - { - return mFifo.peek(); - } + BE_WORK* peek() { return mFifo.peek(); } template <typename ArenaT> bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry) @@ -81,22 +72,16 @@ struct MacroTileQueue ////////////////////////////////////////////////////////////////////////// /// @brief Move to next work item - void dequeue() - { - mFifo.dequeue_noinc(); - } + void dequeue() { mFifo.dequeue_noinc(); } ////////////////////////////////////////////////////////////////////////// /// @brief Destroy fifo - void destroy() - { - mFifo.destroy(); - } + void destroy() { mFifo.destroy(); } ///@todo This will all be private. uint32_t mWorkItemsFE = 0; uint32_t mWorkItemsBE = 0; - uint32_t mId = 0; + uint32_t mId = 0; private: QUEUE<BE_WORK> mFifo; @@ -111,7 +96,7 @@ public: MacroTileMgr(CachingArena& arena); ~MacroTileMgr() { - for (auto *pTile : mTiles) + for (auto* pTile : mTiles) { delete pTile; } @@ -126,16 +111,13 @@ public: } INLINE std::vector<MacroTileQueue*>& getDirtyTiles() { return mDirtyTiles; } - void markTileComplete(uint32_t id); + void markTileComplete(uint32_t id); - INLINE bool isWorkComplete() - { - return mWorkItemsProduced == mWorkItemsConsumed; - } + INLINE bool isWorkComplete() { return mWorkItemsProduced == mWorkItemsConsumed; } - void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork); + void enqueue(uint32_t x, uint32_t y, BE_WORK* pWork); - static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) + static INLINE void getTileIndices(uint32_t tileID, uint32_t& x, uint32_t& y) { // Morton / Z order of tiles x = pext_u32(tileID, 0x55555555); @@ -149,17 +131,21 @@ public: } private: - CachingArena& mArena; + CachingArena& mArena; std::vector<MacroTileQueue*> mTiles; // Any tile that has work queued to it is a dirty tile. 
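getTileIndices above decodes a Morton/Z-order tile ID: pext_u32 with mask 0x55555555 gathers the even bits into x, and presumably an odd-bit mask recovers y (that line falls outside the quoted hunk). Assuming pext_u32 wraps the BMI2 instruction, the round trip looks like this (requires -mbmi2):

#include <immintrin.h>   // BMI2: _pext_u32 / _pdep_u32
#include <stdint.h>
#include <assert.h>

// Morton/Z-order tile IDs: even bits carry x, odd bits carry y.
static void TileIdToXY(uint32_t tileID, uint32_t& x, uint32_t& y)
{
    x = _pext_u32(tileID, 0x55555555u);  // gather even bits
    y = _pext_u32(tileID, 0xAAAAAAAAu);  // gather odd bits (assumed mask)
}

static uint32_t XYToTileId(uint32_t x, uint32_t y)
{
    return _pdep_u32(x, 0x55555555u) | _pdep_u32(y, 0xAAAAAAAAu);
}

int main()
{
    uint32_t x, y;
    TileIdToXY(XYToTileId(5, 3), x, y);
    assert(x == 5 && y == 3);
    return 0;
}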
std::vector<MacroTileQueue*> mDirtyTiles; - OSALIGNLINE(long) mWorkItemsProduced { 0 }; - OSALIGNLINE(volatile long) mWorkItemsConsumed { 0 }; + OSALIGNLINE(long) mWorkItemsProduced{0}; + OSALIGNLINE(volatile long) mWorkItemsConsumed{0}; }; -typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace); +typedef void (*PFN_DISPATCH)(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t threadGroupId, + void*& pSpillFillBuffer, + void*& pScratchSpace); ////////////////////////////////////////////////////////////////////////// /// DispatchQueue - work queue for dispatch @@ -175,23 +161,20 @@ public: { // The available and outstanding counts start with total tasks. // At the start there are N tasks available and outstanding. - // When both the available and outstanding counts have reached 0 then all work has completed. - // When a worker starts on a threadgroup then it decrements the available count. + // When both the available and outstanding counts have reached 0 then all work has + // completed. When a worker starts on a threadgroup then it decrements the available count. // When a worker completes a threadgroup then it decrements the outstanding count. - mTasksAvailable = totalTasks; + mTasksAvailable = totalTasks; mTasksOutstanding = totalTasks; - mpTaskData = pTaskData; + mpTaskData = pTaskData; mPfnDispatch = pfnDispatch; } ////////////////////////////////////////////////////////////////////////// /// @brief Returns number of tasks available for this dispatch. - uint32_t getNumQueued() - { - return (mTasksAvailable > 0) ? mTasksAvailable : 0; - } + uint32_t getNumQueued() { return (mTasksAvailable > 0) ? mTasksAvailable : 0; } ////////////////////////////////////////////////////////////////////////// /// @brief Atomically decrement the work available count. If the result @@ -224,50 +207,49 @@ public: ////////////////////////////////////////////////////////////////////////// /// @brief Work is complete once both the available/outstanding counts have reached 0. - bool isWorkComplete() - { - return ((mTasksAvailable <= 0) && - (mTasksOutstanding <= 0)); - } + bool isWorkComplete() { return ((mTasksAvailable <= 0) && (mTasksOutstanding <= 0)); } ////////////////////////////////////////////////////////////////////////// /// @brief Return pointer to task data. - const void* GetTasksData() - { - return mpTaskData; - } + const void* GetTasksData() { return mpTaskData; } ////////////////////////////////////////////////////////////////////////// /// @brief Dispatches a unit of work - void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) + void dispatch(DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t threadGroupId, + void*& pSpillFillBuffer, + void*& pScratchSpace) { SWR_ASSERT(mPfnDispatch != nullptr); mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); } - void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. - PFN_DISPATCH mPfnDispatch{ nullptr }; // Function to call per dispatch + void* mpTaskData{nullptr}; // The API thread will set this up and the callback task function + // will interpet this. 
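The reflowed comment block in DispatchQueue::initialize describes a two-counter completion protocol: a worker first claims a threadgroup (available goes down), then reports it finished (outstanding goes down), and all work is done once both counters have drained. A compact sketch of that protocol with std::atomic, not SWR's actual class (which uses OSALIGNLINE(volatile long) plus platform atomics):

#include <atomic>

struct DispatchCounters
{
    std::atomic<long> available{0};
    std::atomic<long> outstanding{0};

    void initialize(long totalTasks)
    {
        available.store(totalTasks);
        outstanding.store(totalTasks);
    }

    // Claim one task; returns its index, or -1 if nothing is left to claim.
    long tryClaim()
    {
        long prev = available.fetch_sub(1);
        return (prev > 0) ? (prev - 1) : -1;   // counter may go negative; benign
    }

    void complete() { outstanding.fetch_sub(1); }

    bool isWorkComplete() const
    {
        return available.load() <= 0 && outstanding.load() <= 0;
    }
};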
+ PFN_DISPATCH mPfnDispatch{nullptr}; // Function to call per dispatch - OSALIGNLINE(volatile long) mTasksAvailable{ 0 }; - OSALIGNLINE(volatile long) mTasksOutstanding{ 0 }; + OSALIGNLINE(volatile long) mTasksAvailable{0}; + OSALIGNLINE(volatile long) mTasksOutstanding{0}; }; - enum HOTTILE_STATE { - HOTTILE_INVALID, // tile is in unitialized state and should be loaded with surface contents before rendering - HOTTILE_CLEAR, // tile should be cleared - HOTTILE_DIRTY, // tile has been rendered to - HOTTILE_RESOLVED, // tile has been stored to memory + HOTTILE_INVALID, // tile is in unitialized state and should be loaded with surface contents + // before rendering + HOTTILE_CLEAR, // tile should be cleared + HOTTILE_DIRTY, // tile has been rendered to + HOTTILE_RESOLVED, // tile has been stored to memory }; struct HOTTILE { - uint8_t *pBuffer; + uint8_t* pBuffer; HOTTILE_STATE state; - DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment? + DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for + // alignment? uint32_t numSamples; - uint32_t renderTargetArrayIndex; // current render target array index loaded + uint32_t renderTargetArrayIndex; // current render target array index loaded }; union HotTileSet @@ -291,10 +273,13 @@ public: // cache hottile size for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) { - mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; + mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * + FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; } - mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; - mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * + FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * + FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; } ~HotTileMgr() @@ -311,12 +296,26 @@ public: } } - void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID); - - HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, - uint32_t renderTargetArrayIndex = 0); - - HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1); + void InitializeHotTiles(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t workerId, + uint32_t macroID); + + HOTTILE* GetHotTile(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + HANDLE hWorkerData, + uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, + bool create, + uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0); + + HOTTILE* GetHotTileNoLoad(SWR_CONTEXT* pContext, + DRAW_CONTEXT* pDC, + uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, + bool create, + uint32_t numSamples = 1); static void ClearColorHotTile(const HOTTILE* pHotTile); static void ClearDepthHotTile(const HOTTILE* pHotTile); @@ -324,14 +323,15 @@ public: private: HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; - uint32_t 
mHotTileSize[SWR_NUM_ATTACHMENTS]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) { void* p = nullptr; #if defined(_WIN32) HANDLE hProcess = GetCurrentProcess(); - p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); + p = VirtualAllocExNuma( + hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); #else p = AlignedMalloc(size, align); #endif @@ -351,4 +351,3 @@ private: } } }; - diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h index 3eb4c5d1f00..e28c84d789f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tileset.h +++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file tileset.h -* -* @brief Custom bitset class for managing locked tiles -* -******************************************************************************/ + * Copyright (C) 2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
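Worth noting from the HotTileMgr hunk just above: AllocHotTileMem commits hot-tile pages on a specific NUMA node on Windows and falls back to plain aligned allocation elsewhere. A standalone sketch of that split (the non-Windows branch is an assumption; SWR's AlignedMalloc is its own helper, and aligned_alloc requires size to be a multiple of align):

#include <cstddef>
#include <cstdlib>
#if defined(_WIN32)
#include <windows.h>
#endif

// Windows can ask the kernel for pages on a given NUMA node; the portable
// fallback takes aligned memory wherever the first-touch policy places it.
static void* AllocOnNode(size_t size, size_t align, unsigned numaNode)
{
#if defined(_WIN32)
    return VirtualAllocExNuma(GetCurrentProcess(), nullptr, size,
                              MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
#else
    (void)numaNode;                      // e.g. rely on first-touch, or libnuma
    return aligned_alloc(align, size);
#endif
}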
+ * + * @file tileset.h + * + * @brief Custom bitset class for managing locked tiles + * + ******************************************************************************/ #pragma once struct TileSet @@ -65,16 +65,13 @@ struct TileSet private: static const size_t BITS_PER_WORD = sizeof(size_t) * 8; - static const size_t BITS_OFFSET = BITS_PER_WORD - 1; + static const size_t BITS_OFFSET = BITS_PER_WORD - 1; - size_t m_size = 0; - size_t m_maxSet = 0; - size_t* m_bits = nullptr; + size_t m_size = 0; + size_t m_maxSet = 0; + size_t* m_bits = nullptr; - INLINE size_t& _get_word(size_t idx) - { - return m_bits[idx / BITS_PER_WORD]; - } + INLINE size_t& _get_word(size_t idx) { return m_bits[idx / BITS_PER_WORD]; } void _grow(size_t idx) { @@ -83,10 +80,10 @@ private: return; } - size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET; - size_t num_words = new_size / BITS_PER_WORD; - size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64); - size_t copy_words = 0; + size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET; + size_t num_words = new_size / BITS_PER_WORD; + size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64); + size_t copy_words = 0; if (m_bits) { diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 7769e05a678..27c9c606d17 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file utils.h -* -* @brief Utilities used by SWR core. -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
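TileSet::_grow above rounds the requested bit index up to a whole number of size_t words before reallocating. A self-contained sketch of the same growth rule (hypothetical container; calloc/memcpy stand in for SWR's AlignedMalloc and copy loop):

#include <cstddef>
#include <cstring>
#include <cstdlib>

struct GrowableBits
{
    static const size_t BPW = sizeof(size_t) * 8;
    size_t* words    = nullptr;
    size_t  sizeBits = 0;

    void set(size_t idx)
    {
        if (idx >= sizeBits)
            grow(idx);
        words[idx / BPW] |= size_t(1) << (idx % BPW);
    }

    bool get(size_t idx) const
    {
        return idx < sizeBits && ((words[idx / BPW] >> (idx % BPW)) & 1);
    }

private:
    void grow(size_t idx)
    {
        // Same rounding as _grow: (1 + idx + (BPW-1)) & ~(BPW-1).
        size_t newBits  = (idx + BPW) & ~(BPW - 1);
        size_t newWords = newBits / BPW;
        size_t* p = (size_t*)calloc(newWords, sizeof(size_t));
        if (words)
        {
            memcpy(p, words, (sizeBits / BPW) * sizeof(size_t));
            free(words);
        }
        words    = p;
        sizeBits = newBits;
    }
};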
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file utils.h + * + * @brief Utilities used by SWR core. + * + ******************************************************************************/ #pragma once #include <string.h> @@ -54,38 +54,44 @@ struct simd16BBox }; #endif -template<typename SIMD_T> +template <typename SIMD_T> struct SIMDBBOX_T { - typename SIMD_T::Integer ymin; - typename SIMD_T::Integer ymax; - typename SIMD_T::Integer xmin; - typename SIMD_T::Integer xmax; + typename SIMD_T::Integer ymin; + typename SIMD_T::Integer ymax; + typename SIMD_T::Integer xmin; + typename SIMD_T::Integer xmax; }; // helper function to unroll loops -template<int Begin, int End, int Step = 1> -struct UnrollerL { - template<typename Lambda> - INLINE static void step(Lambda& func) { +template <int Begin, int End, int Step = 1> +struct UnrollerL +{ + template <typename Lambda> + INLINE static void step(Lambda& func) + { func(Begin); UnrollerL<Begin + Step, End, Step>::step(func); } }; -template<int End, int Step> -struct UnrollerL<End, End, Step> { - template<typename Lambda> - static void step(Lambda& func) { +template <int End, int Step> +struct UnrollerL<End, End, Step> +{ + template <typename Lambda> + static void step(Lambda& func) + { } }; // helper function to unroll loops, with mask to skip specific iterations -template<int Begin, int End, int Step = 1, int Mask = 0x7f> -struct UnrollerLMask { - template<typename Lambda> - INLINE static void step(Lambda& func) { - if(Mask & (1 << Begin)) +template <int Begin, int End, int Step = 1, int Mask = 0x7f> +struct UnrollerLMask +{ + template <typename Lambda> + INLINE static void step(Lambda& func) + { + if (Mask & (1 << Begin)) { func(Begin); } @@ -93,29 +99,31 @@ struct UnrollerLMask { } }; -template<int End, int Step, int Mask> -struct UnrollerLMask<End, End, Step, Mask> { - template<typename Lambda> - static void step(Lambda& func) { +template <int End, int Step, int Mask> +struct UnrollerLMask<End, End, Step, Mask> +{ + template <typename Lambda> + static void step(Lambda& func) + { } }; // general CRC compute INLINE -uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) +uint32_t ComputeCRC(uint32_t crc, const void* pData, uint32_t size) { #if defined(_WIN64) || defined(__x86_64__) - uint32_t sizeInQwords = size / sizeof(uint64_t); - uint32_t sizeRemainderBytes = size % sizeof(uint64_t); - uint64_t* pDataWords = (uint64_t*)pData; + uint32_t sizeInQwords = size / 
sizeof(uint64_t); + uint32_t sizeRemainderBytes = size % sizeof(uint64_t); + uint64_t* pDataWords = (uint64_t*)pData; for (uint32_t i = 0; i < sizeInQwords; ++i) { crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); } #else - uint32_t sizeInDwords = size / sizeof(uint32_t); - uint32_t sizeRemainderBytes = size % sizeof(uint32_t); - uint32_t* pDataWords = (uint32_t*)pData; + uint32_t sizeInDwords = size / sizeof(uint32_t); + uint32_t sizeRemainderBytes = size % sizeof(uint32_t); + uint32_t* pDataWords = (uint32_t*)pData; for (uint32_t i = 0; i < sizeInDwords; ++i) { crc = _mm_crc32_u32(crc, *pDataWords++); @@ -135,8 +143,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) /// Check specified bit within a data word ////////////////////////////////////////////////////////////////////////// template <typename T> -INLINE -static bool CheckBit(T word, uint32_t bit) +INLINE static bool CheckBit(T word, uint32_t bit) { return 0 != (word & (T(1) << bit)); } @@ -145,8 +152,7 @@ static bool CheckBit(T word, uint32_t bit) /// Add byte offset to any-type pointer ////////////////////////////////////////////////////////////////////////// template <typename T> -INLINE -static T* PtrAdd(T* p, intptr_t offset) +INLINE static T* PtrAdd(T* p, intptr_t offset) { intptr_t intp = reinterpret_cast<intptr_t>(p); return reinterpret_cast<T*>(intp + offset); @@ -156,8 +162,7 @@ static T* PtrAdd(T* p, intptr_t offset) /// Is a power-of-2? ////////////////////////////////////////////////////////////////////////// template <typename T> -INLINE -static bool IsPow2(T value) +INLINE static bool IsPow2(T value) { return value == (value & (T(0) - value)); } @@ -167,8 +172,7 @@ static bool IsPow2(T value) /// Note: IsPow2(alignment) MUST be true ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1 AlignDownPow2(T1 value, T2 alignment) +INLINE static T1 AlignDownPow2(T1 value, T2 alignment) { SWR_ASSERT(IsPow2(alignment)); return value & ~T1(alignment - 1); @@ -179,8 +183,7 @@ static T1 AlignDownPow2(T1 value, T2 alignment) /// Note: IsPow2(alignment) MUST be true ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1 AlignUpPow2(T1 value, T2 alignment) +INLINE static T1 AlignUpPow2(T1 value, T2 alignment) { return AlignDownPow2(value + T1(alignment - 1), alignment); } @@ -190,8 +193,7 @@ static T1 AlignUpPow2(T1 value, T2 alignment) /// Note: IsPow2(alignment) MUST be true ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1* AlignUpPow2(T1* value, T2 alignment) +INLINE static T1* AlignUpPow2(T1* value, T2 alignment) { return reinterpret_cast<T1*>( AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); @@ -201,10 +203,12 @@ static T1* AlignUpPow2(T1* value, T2 alignment) /// Align down to specified alignment ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1 AlignDown(T1 value, T2 alignment) +INLINE static T1 AlignDown(T1 value, T2 alignment) { - if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } + if (IsPow2(alignment)) + { + return AlignDownPow2(value, alignment); + } return value - T1(value % alignment); } @@ -212,8 +216,7 @@ static T1 AlignDown(T1 value, T2 alignment) /// Align down to specified alignment 
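For readers new to utils.h, UnrollerL from the hunks above is a compile-time loop: each recursive instantiation invokes the lambda with the next constant index, and the Begin == End specialization terminates the recursion. A runnable demo (the unroller is restated so the example is self-contained):

#include <cstdio>

// UnrollerL from utils.h, reproduced for a standalone demo.
template <int Begin, int End, int Step = 1>
struct UnrollerL
{
    template <typename Lambda>
    static void step(Lambda& func)
    {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};
template <int End, int Step>
struct UnrollerL<End, End, Step>
{
    template <typename Lambda>
    static void step(Lambda&) {}
};

int main()
{
    int sum  = 0;
    auto body = [&](int i) { sum += i * i; };  // fully unrolled at compile time
    UnrollerL<0, 4>::step(body);               // calls body(0)..body(3)
    printf("sum = %d\n", sum);                 // 0 + 1 + 4 + 9 = 14
    return 0;
}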
////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1* AlignDown(T1* value, T2 alignment) +INLINE static T1* AlignDown(T1* value, T2 alignment) { return (T1*)AlignDown(uintptr_t(value), alignment); } @@ -223,8 +226,7 @@ static T1* AlignDown(T1* value, T2 alignment) /// Note: IsPow2(alignment) MUST be true ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1 AlignUp(T1 value, T2 alignment) +INLINE static T1 AlignUp(T1 value, T2 alignment) { return AlignDown(value + T1(alignment - 1), alignment); } @@ -234,33 +236,31 @@ static T1 AlignUp(T1 value, T2 alignment) /// Note: IsPow2(alignment) MUST be true ////////////////////////////////////////////////////////////////////////// template <typename T1, typename T2> -INLINE -static T1* AlignUp(T1* value, T2 alignment) +INLINE static T1* AlignUp(T1* value, T2 alignment) { return AlignDown(PtrAdd(value, alignment - 1), alignment); } ////////////////////////////////////////////////////////////////////////// -/// Helper structure used to access an array of elements that don't +/// Helper structure used to access an array of elements that don't /// correspond to a typical word size. ////////////////////////////////////////////////////////////////////////// -template<typename T, size_t BitsPerElementT, size_t ArrayLenT> +template <typename T, size_t BitsPerElementT, size_t ArrayLenT> class BitsArray { private: - static const size_t BITS_PER_WORD = sizeof(size_t) * 8; + static const size_t BITS_PER_WORD = sizeof(size_t) * 8; static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; - static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; - static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; + static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; + static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, - "Element size must an integral fraction of pointer size"); + "Element size must an integral fraction of pointer size"); - size_t m_words[NUM_WORDS] = {}; + size_t m_words[NUM_WORDS] = {}; public: - - T operator[] (size_t elementIndex) const + T operator[](size_t elementIndex) const { size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); @@ -324,9 +324,11 @@ struct TemplateArgUnroller } if (TMax > TMin) { - return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(RangedArg<T, TMin, (T)(int(TMax)-1)>{iArg.val}); + return TemplateArgUnroller<TermT, ArgsB...>::GetFunc( + RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}); } - SWR_ASSUME(false); return nullptr; + SWR_ASSUME(false); + return nullptr; } template <typename T, T TVal> static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg) @@ -341,19 +343,23 @@ struct TemplateArgUnroller { if (iArg.val == TMax) { - return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc(remainingArgs...); + return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc( + remainingArgs...); } if (TMax > TMin) { - return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...); + return TemplateArgUnroller<TermT, ArgsB...>::GetFunc( + RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...); } - 
SWR_ASSUME(false); return nullptr; + SWR_ASSUME(false); + return nullptr; } template <typename T, T TVal, typename... TArgsT> static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg, TArgsT... remainingArgs) { SWR_ASSERT(iArg.val == TVal); - return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc(remainingArgs...); + return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc( + remainingArgs...); } }; @@ -365,12 +371,13 @@ static INLINE std::string GetEnv(const std::string& variableName) std::string output; #if defined(_WIN32) DWORD valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0); - if (!valueSize) return output; + if (!valueSize) + return output; output.resize(valueSize - 1); // valueSize includes null, output.resize() does not GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize); #else - char *env = getenv(variableName.c_str()); - output = env ? env : ""; + char* env = getenv(variableName.c_str()); + output = env ? env : ""; #endif return output; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index e9412b1b53c..5cf527ecd89 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file JitManager.cpp -* -* @brief Implementation if the Jit Manager. -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
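TemplateArgUnroller, reflowed above, turns a runtime value within a compile-time range into a call on a function templated over that value, peeling one candidate per recursion step. A much-simplified sketch of the idea (single argument, C++17 if constexpr in place of SWR's recursion plus SWR_ASSUME; Kernel is a hypothetical payload):

#include <cstdio>

template <int Val>
static void Kernel() { printf("instantiated for %d\n", Val); }

// Map runtime v in [Min, Max] to the matching Kernel<v> instantiation.
template <int Min, int Max>
static void Dispatch(int v)
{
    if (v == Max)
    {
        Kernel<Max>();
        return;
    }
    if constexpr (Max > Min)
    {
        Dispatch<Min, Max - 1>(v);   // try the next smaller candidate
    }
}

int main()
{
    Dispatch<0, 3>(2);   // runtime 2 becomes compile-time Kernel<2>()
    return 0;
}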
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file JitManager.cpp + * + * @brief Implementation if the Jit Manager. + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "JitManager.h" @@ -59,21 +59,22 @@ using namespace SwrJit; ////////////////////////////////////////////////////////////////////////// /// @brief Contructor for JitManager. /// @param simdWidth - SIMD width to be used in generated program. -JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) - : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch) +JitManager::JitManager(uint32_t simdWidth, const char *arch, const char *core) : + mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), + mArch(arch) { InitializeNativeTarget(); InitializeNativeTargetAsmPrinter(); InitializeNativeTargetDisassembler(); - TargetOptions tOpts; + TargetOptions tOpts; tOpts.AllowFPOpFusion = FPOpFusion::Fast; - tOpts.NoInfsFPMath = false; - tOpts.NoNaNsFPMath = false; + tOpts.NoInfsFPMath = false; + tOpts.NoNaNsFPMath = false; tOpts.UnsafeFPMath = false; - //tOpts.PrintMachineCode = true; + // tOpts.PrintMachineCode = true; std::unique_ptr<Module> newModule(new Module("", mContext)); mpCurrentModule = newModule.get(); @@ -81,10 +82,10 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) StringRef hostCPUName; // force JIT to use the same CPU arch as the rest of swr - if(mArch.AVX512F()) + if (mArch.AVX512F()) { #if USE_SIMD16_SHADERS - if(mArch.AVX512ER()) + if (mArch.AVX512ER()) { hostCPUName = StringRef("knl"); } @@ -101,7 +102,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) mVWidth = 8; } } - else if(mArch.AVX2()) + else if (mArch.AVX2()) { hostCPUName = StringRef("core-avx2"); if (mVWidth == 0) @@ -109,7 +110,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) mVWidth = 8; } } - else if(mArch.AVX()) + else if (mArch.AVX()) { if (mArch.F16C()) { @@ -140,10 +141,10 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) mpCurrentModule->setTargetTriple(sys::getProcessTriple()); mpExec = EngineBuilder(std::move(newModule)) - .setTargetOptions(tOpts) - .setOptLevel(optLevel) - .setMCPU(hostCPUName) - .create(); + .setTargetOptions(tOpts) + .setOptLevel(optLevel) + .setMCPU(hostCPUName) + .create(); if (KNOB_JIT_ENABLE_CACHE) { @@ -162,7 +163,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) #else // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); #endif - std::vector<Type*> fsArgs; + std::vector<Type *> fsArgs; // llvm5 is picky and does not take a void * type fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); @@ -211,32 +212,44 @@ void JitManager::SetupNewModule() } -DIType* JitManager::CreateDebugStructType(StructType* pType, const std::string& name, DIFile* pFile, uint32_t lineNum, - const 
std::vector<std::pair<std::string, uint32_t>>& members) +DIType * +JitManager::CreateDebugStructType(StructType * pType, + const std::string & name, + DIFile * pFile, + uint32_t lineNum, + const std::vector<std::pair<std::string, uint32_t>> &members) { - DIBuilder builder(*mpCurrentModule); - SmallVector<Metadata*, 8> ElemTypes; - DataLayout DL = DataLayout(mpCurrentModule); - uint32_t size = DL.getTypeAllocSizeInBits(pType); - uint32_t alignment = DL.getABITypeAlignment(pType); - DINode::DIFlags flags = DINode::DIFlags::FlagPublic; - - DICompositeType* pDIStructTy = builder.createStructType(pFile, name, pFile, lineNum, size, alignment, - flags, nullptr, builder.getOrCreateArray(ElemTypes)); + DIBuilder builder(*mpCurrentModule); + SmallVector<Metadata *, 8> ElemTypes; + DataLayout DL = DataLayout(mpCurrentModule); + uint32_t size = DL.getTypeAllocSizeInBits(pType); + uint32_t alignment = DL.getABITypeAlignment(pType); + DINode::DIFlags flags = DINode::DIFlags::FlagPublic; + + DICompositeType *pDIStructTy = builder.createStructType(pFile, + name, + pFile, + lineNum, + size, + alignment, + flags, + nullptr, + builder.getOrCreateArray(ElemTypes)); // Register mapping now to break loops (in case struct contains itself or pointers to itself) mDebugStructMap[pType] = pDIStructTy; uint32_t idx = 0; - for (auto& elem : pType->elements()) + for (auto &elem : pType->elements()) { - std::string name = members[idx].first; - uint32_t lineNum = members[idx].second; - size = DL.getTypeAllocSizeInBits(elem); - alignment = DL.getABITypeAlignment(elem); - uint32_t offset = DL.getStructLayout(pType)->getElementOffsetInBits(idx); - llvm::DIType* pDebugTy = GetDebugType(elem); - ElemTypes.push_back(builder.createMemberType(pDIStructTy, name, pFile, lineNum, size, alignment, offset, flags, pDebugTy)); + std::string name = members[idx].first; + uint32_t lineNum = members[idx].second; + size = DL.getTypeAllocSizeInBits(elem); + alignment = DL.getABITypeAlignment(elem); + uint32_t offset = DL.getStructLayout(pType)->getElementOffsetInBits(idx); + llvm::DIType *pDebugTy = GetDebugType(elem); + ElemTypes.push_back(builder.createMemberType( + pDIStructTy, name, pFile, lineNum, size, alignment, offset, flags, pDebugTy)); idx++; } @@ -245,54 +258,76 @@ DIType* JitManager::CreateDebugStructType(StructType* pType, const std::string& return pDIStructTy; } -DIType* JitManager::GetDebugArrayType(Type* pTy) +DIType *JitManager::GetDebugArrayType(Type *pTy) { - DIBuilder builder(*mpCurrentModule); - DataLayout DL = DataLayout(mpCurrentModule); - ArrayType* pArrayTy = cast<ArrayType>(pTy); - uint32_t size = DL.getTypeAllocSizeInBits(pArrayTy); - uint32_t alignment = DL.getABITypeAlignment(pArrayTy); + DIBuilder builder(*mpCurrentModule); + DataLayout DL = DataLayout(mpCurrentModule); + ArrayType *pArrayTy = cast<ArrayType>(pTy); + uint32_t size = DL.getTypeAllocSizeInBits(pArrayTy); + uint32_t alignment = DL.getABITypeAlignment(pArrayTy); - SmallVector<Metadata*, 8> Elems; + SmallVector<Metadata *, 8> Elems; Elems.push_back(builder.getOrCreateSubrange(0, pArrayTy->getNumElements())); - return builder.createArrayType(size, alignment, GetDebugType(pArrayTy->getElementType()), builder.getOrCreateArray(Elems)); + return builder.createArrayType( + size, alignment, GetDebugType(pArrayTy->getElementType()), builder.getOrCreateArray(Elems)); } // Create a DIType from llvm Type -DIType* JitManager::GetDebugType(Type* pTy) +DIType *JitManager::GetDebugType(Type *pTy) { - DIBuilder builder(*mpCurrentModule); + DIBuilder 
builder(*mpCurrentModule); Type::TypeID id = pTy->getTypeID(); switch (id) { - case Type::VoidTyID: return builder.createUnspecifiedType("void"); break; - case Type::HalfTyID: return builder.createBasicType("float16", 16, dwarf::DW_ATE_float); break; - case Type::FloatTyID: return builder.createBasicType("float", 32, dwarf::DW_ATE_float); break; - case Type::DoubleTyID: return builder.createBasicType("double", 64, dwarf::DW_ATE_float); break; - case Type::IntegerTyID: return GetDebugIntegerType(pTy); break; - case Type::StructTyID: return GetDebugStructType(pTy); break; - case Type::ArrayTyID: return GetDebugArrayType(pTy); break; - case Type::PointerTyID: return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64); break; - case Type::VectorTyID: return GetDebugVectorType(pTy); break; - case Type::FunctionTyID: return GetDebugFunctionType(pTy); break; - default: SWR_ASSERT(false, "Unimplemented llvm type"); + case Type::VoidTyID: + return builder.createUnspecifiedType("void"); + break; + case Type::HalfTyID: + return builder.createBasicType("float16", 16, dwarf::DW_ATE_float); + break; + case Type::FloatTyID: + return builder.createBasicType("float", 32, dwarf::DW_ATE_float); + break; + case Type::DoubleTyID: + return builder.createBasicType("double", 64, dwarf::DW_ATE_float); + break; + case Type::IntegerTyID: + return GetDebugIntegerType(pTy); + break; + case Type::StructTyID: + return GetDebugStructType(pTy); + break; + case Type::ArrayTyID: + return GetDebugArrayType(pTy); + break; + case Type::PointerTyID: + return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64); + break; + case Type::VectorTyID: + return GetDebugVectorType(pTy); + break; + case Type::FunctionTyID: + return GetDebugFunctionType(pTy); + break; + default: + SWR_ASSERT(false, "Unimplemented llvm type"); } return nullptr; } // Create a DISubroutineType from an llvm FunctionType -DIType* JitManager::GetDebugFunctionType(Type* pTy) +DIType *JitManager::GetDebugFunctionType(Type *pTy) { - SmallVector<Metadata*, 8> ElemTypes; - FunctionType* pFuncTy = cast<FunctionType>(pTy); - DIBuilder builder(*mpCurrentModule); + SmallVector<Metadata *, 8> ElemTypes; + FunctionType * pFuncTy = cast<FunctionType>(pTy); + DIBuilder builder(*mpCurrentModule); // Add result type ElemTypes.push_back(GetDebugType(pFuncTy->getReturnType())); // Add arguments - for (auto& param : pFuncTy->params()) + for (auto ¶m : pFuncTy->params()) { ElemTypes.push_back(GetDebugType(param)); } @@ -300,60 +335,74 @@ DIType* JitManager::GetDebugFunctionType(Type* pTy) return builder.createSubroutineType(builder.getOrCreateTypeArray(ElemTypes)); } -DIType* JitManager::GetDebugIntegerType(Type* pTy) +DIType *JitManager::GetDebugIntegerType(Type *pTy) { - DIBuilder builder(*mpCurrentModule); - IntegerType* pIntTy = cast<IntegerType>(pTy); + DIBuilder builder(*mpCurrentModule); + IntegerType *pIntTy = cast<IntegerType>(pTy); switch (pIntTy->getBitWidth()) { - case 1: return builder.createBasicType("int1", 1, dwarf::DW_ATE_unsigned); break; - case 8: return builder.createBasicType("int8", 8, dwarf::DW_ATE_signed); break; - case 16: return builder.createBasicType("int16", 16, dwarf::DW_ATE_signed); break; - case 32: return builder.createBasicType("int", 32, dwarf::DW_ATE_signed); break; - case 64: return builder.createBasicType("int64", 64, dwarf::DW_ATE_signed); break; - case 128: return builder.createBasicType("int128", 128, dwarf::DW_ATE_signed); break; - default: SWR_ASSERT(false, "Unimplemented integer bit 
width"); + case 1: + return builder.createBasicType("int1", 1, dwarf::DW_ATE_unsigned); + break; + case 8: + return builder.createBasicType("int8", 8, dwarf::DW_ATE_signed); + break; + case 16: + return builder.createBasicType("int16", 16, dwarf::DW_ATE_signed); + break; + case 32: + return builder.createBasicType("int", 32, dwarf::DW_ATE_signed); + break; + case 64: + return builder.createBasicType("int64", 64, dwarf::DW_ATE_signed); + break; + case 128: + return builder.createBasicType("int128", 128, dwarf::DW_ATE_signed); + break; + default: + SWR_ASSERT(false, "Unimplemented integer bit width"); } return nullptr; } -DIType* JitManager::GetDebugVectorType(Type* pTy) +DIType *JitManager::GetDebugVectorType(Type *pTy) { - DIBuilder builder(*mpCurrentModule); - VectorType* pVecTy = cast<VectorType>(pTy); - DataLayout DL = DataLayout(mpCurrentModule); - uint32_t size = DL.getTypeAllocSizeInBits(pVecTy); - uint32_t alignment = DL.getABITypeAlignment(pVecTy); - SmallVector<Metadata*, 1> Elems; + DIBuilder builder(*mpCurrentModule); + VectorType * pVecTy = cast<VectorType>(pTy); + DataLayout DL = DataLayout(mpCurrentModule); + uint32_t size = DL.getTypeAllocSizeInBits(pVecTy); + uint32_t alignment = DL.getABITypeAlignment(pVecTy); + SmallVector<Metadata *, 1> Elems; Elems.push_back(builder.getOrCreateSubrange(0, pVecTy->getVectorNumElements())); - return builder.createVectorType(size, alignment, GetDebugType(pVecTy->getVectorElementType()), builder.getOrCreateArray(Elems)); - + return builder.createVectorType(size, + alignment, + GetDebugType(pVecTy->getVectorElementType()), + builder.getOrCreateArray(Elems)); } ////////////////////////////////////////////////////////////////////////// /// @brief Dump function x86 assembly to file. /// @note This should only be called after the module has been jitted to x86 and the /// module will not be further accessed. 
-void JitManager::DumpAsm(Function* pFunction, const char* fileName) +void JitManager::DumpAsm(Function *pFunction, const char *fileName) { if (KNOB_DUMP_SHADER_IR) { - #if defined(_WIN32) DWORD pid = GetCurrentProcessId(); - char procname[MAX_PATH]; + char procname[MAX_PATH]; GetModuleFileNameA(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); + const char * pBaseName = strrchr(procname, '\\'); std::stringstream outDir; outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; CreateDirectoryPath(outDir.str().c_str()); #endif std::error_code EC; - Module* pModule = pFunction->getParent(); - const char *funcName = pFunction->getName().data(); - char fName[256]; + Module * pModule = pFunction->getParent(); + const char * funcName = pFunction->getName().data(); + char fName[256]; #if defined(_WIN32) sprintf(fName, "%s\\%s.%s.asm", outDir.str().c_str(), funcName, fileName); #else @@ -362,8 +411,8 @@ void JitManager::DumpAsm(Function* pFunction, const char* fileName) raw_fd_ostream filestream(fName, EC, llvm::sys::fs::F_None); - legacy::PassManager* pMPasses = new legacy::PassManager(); - auto* pTarget = mpExec->getTargetMachine(); + legacy::PassManager *pMPasses = new legacy::PassManager(); + auto * pTarget = mpExec->getTargetMachine(); pTarget->Options.MCOptions.AsmVerbose = true; pTarget->addPassesToEmitFile(*pMPasses, filestream, TargetMachine::CGFT_AssemblyFile); pMPasses->run(*pModule); @@ -376,9 +425,9 @@ std::string JitManager::GetOutputDir() { #if defined(_WIN32) DWORD pid = GetCurrentProcessId(); - char procname[MAX_PATH]; + char procname[MAX_PATH]; GetModuleFileNameA(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); + const char * pBaseName = strrchr(procname, '\\'); std::stringstream outDir; outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid; CreateDirectoryPath(outDir.str().c_str()); @@ -396,8 +445,8 @@ void JitManager::DumpToFile(Module *M, const char *fileName) std::string outDir = GetOutputDir(); std::error_code EC; - const char *funcName = M->getName().data(); - char fName[256]; + const char * funcName = M->getName().data(); + char fName[256]; #if defined(_WIN32) sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName); #else @@ -418,8 +467,8 @@ void JitManager::DumpToFile(Function *f, const char *fileName) std::string outDir = GetOutputDir(); std::error_code EC; - const char *funcName = f->getName().data(); - char fName[256]; + const char * funcName = f->getName().data(); + char fName[256]; #if defined(_WIN32) sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName); #else @@ -436,34 +485,33 @@ void JitManager::DumpToFile(Function *f, const char *fileName) fd.flush(); raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); - WriteGraph(fd_cfg, (const Function*)f); + WriteGraph(fd_cfg, (const Function *)f); fd_cfg.flush(); } } -extern "C" -{ - bool g_DllActive = true; +extern "C" { +bool g_DllActive = true; - ////////////////////////////////////////////////////////////////////////// - /// @brief Create JIT context. - /// @param simdWidth - SIMD width to be used in generated program. - HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core) - { - return new JitManager(targetSimdWidth, arch, core); - } +////////////////////////////////////////////////////////////////////////// +/// @brief Create JIT context. +/// @param simdWidth - SIMD width to be used in generated program. 
+HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char *arch, const char *core) +{ + return new JitManager(targetSimdWidth, arch, core); +} - ////////////////////////////////////////////////////////////////////////// - /// @brief Destroy JIT context. - void JITCALL JitDestroyContext(HANDLE hJitContext) +////////////////////////////////////////////////////////////////////////// +/// @brief Destroy JIT context. +void JITCALL JitDestroyContext(HANDLE hJitContext) +{ + if (g_DllActive) { - if (g_DllActive) - { - delete reinterpret_cast<JitManager*>(hJitContext); - } + delete reinterpret_cast<JitManager *>(hJitContext); } } +} ////////////////////////////////////////////////////////////////////////// /// JitCache @@ -474,31 +522,29 @@ extern "C" ////////////////////////////////////////////////////////////////////////// struct JitCacheFileHeader { - void Init( - uint32_t llCRC, - uint32_t objCRC, - const std::string& moduleID, - const std::string& cpu, - uint32_t optLevel, - uint64_t objSize) + void Init(uint32_t llCRC, + uint32_t objCRC, + const std::string &moduleID, + const std::string &cpu, + uint32_t optLevel, + uint64_t objSize) { m_objSize = objSize; - m_llCRC = llCRC; - m_objCRC = objCRC; + m_llCRC = llCRC; + m_objCRC = objCRC; strncpy(m_ModuleID, moduleID.c_str(), JC_STR_MAX_LEN - 1); m_ModuleID[JC_STR_MAX_LEN - 1] = 0; strncpy(m_Cpu, cpu.c_str(), JC_STR_MAX_LEN - 1); m_Cpu[JC_STR_MAX_LEN - 1] = 0; - m_optLevel = optLevel; + m_optLevel = optLevel; } - bool IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu, uint32_t optLevel) + bool + IsValid(uint32_t llCRC, const std::string &moduleID, const std::string &cpu, uint32_t optLevel) { - if ((m_MagicNumber != JC_MAGIC_NUMBER) || - (m_llCRC != llCRC) || - (m_platformKey != JC_PLATFORM_KEY) || - (m_optLevel != optLevel)) + if ((m_MagicNumber != JC_MAGIC_NUMBER) || (m_llCRC != llCRC) || + (m_platformKey != JC_PLATFORM_KEY) || (m_optLevel != optLevel)) { return false; } @@ -522,27 +568,25 @@ struct JitCacheFileHeader uint64_t GetObjectCRC() const { return m_objCRC; } private: - static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 4; - static const size_t JC_STR_MAX_LEN = 32; - static const uint32_t JC_PLATFORM_KEY = - (LLVM_VERSION_MAJOR << 24) | - (LLVM_VERSION_MINOR << 16) | - (LLVM_VERSION_PATCH << 8) | - ((sizeof(void*) > sizeof(uint32_t)) ? 1 : 0); - - uint64_t m_MagicNumber = JC_MAGIC_NUMBER; - uint64_t m_objSize = 0; - uint32_t m_llCRC = 0; - uint32_t m_platformKey = JC_PLATFORM_KEY; - uint32_t m_objCRC = 0; - uint32_t m_optLevel = 0; - char m_ModuleID[JC_STR_MAX_LEN] = {}; - char m_Cpu[JC_STR_MAX_LEN] = {}; + static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 4; + static const size_t JC_STR_MAX_LEN = 32; + static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) | + (LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) | + ((sizeof(void *) > sizeof(uint32_t)) ? 
1 : 0); + + uint64_t m_MagicNumber = JC_MAGIC_NUMBER; + uint64_t m_objSize = 0; + uint32_t m_llCRC = 0; + uint32_t m_platformKey = JC_PLATFORM_KEY; + uint32_t m_objCRC = 0; + uint32_t m_optLevel = 0; + char m_ModuleID[JC_STR_MAX_LEN] = {}; + char m_Cpu[JC_STR_MAX_LEN] = {}; }; -static inline uint32_t ComputeModuleCRC(const llvm::Module* M) +static inline uint32_t ComputeModuleCRC(const llvm::Module *M) { - std::string bitcodeBuffer; + std::string bitcodeBuffer; raw_string_ostream bitcodeStream(bitcodeBuffer); #if LLVM_VERSION_MAJOR >= 7 @@ -550,7 +594,7 @@ static inline uint32_t ComputeModuleCRC(const llvm::Module* M) #else llvm::WriteBitcodeToFile(M, bitcodeStream); #endif - //M->print(bitcodeStream, nullptr, false); + // M->print(bitcodeStream, nullptr, false); bitcodeStream.flush(); @@ -561,21 +605,24 @@ static inline uint32_t ComputeModuleCRC(const llvm::Module* M) JitCache::JitCache() { #if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) - if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0) { + if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0) + { char *homedir; - if (!(homedir = getenv("HOME"))) { + if (!(homedir = getenv("HOME"))) + { homedir = getpwuid(getuid())->pw_dir; } mCacheDir = homedir; mCacheDir += (KNOB_JIT_CACHE_DIR.c_str() + 1); - } else + } + else #endif { mCacheDir = KNOB_JIT_CACHE_DIR; } } -int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr) +int ExecUnhookedProcess(const std::string &CmdLine, std::string *pStdOut, std::string *pStdErr) { return ExecCmd(CmdLine, "", pStdOut, pStdErr); } @@ -583,7 +630,7 @@ int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::s /// notifyObjectCompiled - Provides a pointer to compiled code for Module M. void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj) { - const std::string& moduleID = M->getModuleIdentifier(); + const std::string &moduleID = M->getModuleIdentifier(); if (!moduleID.length()) { return; @@ -605,7 +652,7 @@ void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef objPath += JIT_OBJ_EXT; { - std::error_code err; + std::error_code err; llvm::raw_fd_ostream fileObj(objPath.c_str(), err, llvm::sys::fs::F_None); fileObj << Obj.getBuffer(); fileObj.flush(); @@ -613,14 +660,14 @@ void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef { - std::error_code err; + std::error_code err; llvm::raw_fd_ostream fileObj(filePath.c_str(), err, llvm::sys::fs::F_None); uint32_t objcrc = ComputeCRC(0, Obj.getBufferStart(), Obj.getBufferSize()); header.Init(mCurrentModuleCRC, objcrc, moduleID, mCpu, mOptLevel, Obj.getBufferSize()); - fileObj.write((const char*)&header, sizeof(header)); + fileObj.write((const char *)&header, sizeof(header)); fileObj.flush(); } } @@ -628,10 +675,10 @@ void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef /// Returns a pointer to a newly allocated MemoryBuffer that contains the /// object which corresponds with Module M, or 0 if an object is not /// available. 
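// A worked example of the JC_PLATFORM_KEY packing defined above: for a
// hypothetical LLVM 6.0.1 build on a 64-bit host the fields combine as
// (6 << 24) | (0 << 16) | (1 << 8) | 1 == 0x06000101, so a cache file written
// by a different LLVM version, pointer size, or optimization level fails
// IsValid() and the module is simply recompiled.
static_assert(((6u << 24) | (0u << 16) | (1u << 8) | 1u) == 0x06000101u,
              "platform key for a hypothetical LLVM 6.0.1 / 64-bit build");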
-std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M) +std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module *M) { - const std::string& moduleID = M->getModuleIdentifier(); - mCurrentModuleCRC = ComputeModuleCRC(M); + const std::string &moduleID = M->getModuleIdentifier(); + mCurrentModuleCRC = ComputeModuleCRC(M); if (!moduleID.length()) { @@ -649,8 +696,8 @@ std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M) llvm::SmallString<MAX_PATH> objFilePath = filePath; objFilePath += JIT_OBJ_EXT; - FILE* fpObjIn = nullptr; - FILE* fpIn = fopen(filePath.c_str(), "rb"); + FILE *fpObjIn = nullptr; + FILE *fpIn = fopen(filePath.c_str(), "rb"); if (!fpIn) { return nullptr; @@ -681,7 +728,7 @@ std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M) #else pBuf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetObjectSize())); #endif - if (!fread(const_cast<char*>(pBuf->getBufferStart()), header.GetObjectSize(), 1, fpObjIn)) + if (!fread(const_cast<char *>(pBuf->getBufferStart()), header.GetObjectSize(), 1, fpObjIn)) { pBuf = nullptr; break; @@ -694,8 +741,7 @@ std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M) break; } - } - while (0); + } while (0); fclose(fpIn); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index 152776a6513..a5b6af91f06 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file JitManager.h -* -* @brief JitManager contains the LLVM data structures used for JIT generation -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file JitManager.h + * + * @brief JitManager contains the LLVM data structures used for JIT generation + * + * Notes: + * + ******************************************************************************/ #pragma once #include "jit_pch.hpp" @@ -37,7 +37,7 @@ /// JitInstructionSet /// @brief Subclass of InstructionSet that allows users to override /// the reporting of support for certain ISA features. This allows capping -/// the jitted code to a certain feature level, e.g. jit AVX level code on +/// the jitted code to a certain feature level, e.g. jit AVX level code on /// a platform that supports AVX2. ////////////////////////////////////////////////////////////////////////// class JitInstructionSet : public InstructionSet @@ -47,22 +47,22 @@ public: { std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); - if(isaRequest == "avx") + if (isaRequest == "avx") { - bForceAVX = true; - bForceAVX2 = false; + bForceAVX = true; + bForceAVX2 = false; bForceAVX512 = false; } - else if(isaRequest == "avx2") + else if (isaRequest == "avx2") { - bForceAVX = false; - bForceAVX2 = true; + bForceAVX = false; + bForceAVX2 = true; bForceAVX512 = false; } - else if(isaRequest == "avx512") + else if (isaRequest == "avx512") { - bForceAVX = false; - bForceAVX2 = false; + bForceAVX = false; + bForceAVX2 = false; bForceAVX512 = true; } }; @@ -73,19 +73,16 @@ public: bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } private: - bool bForceAVX = false; - bool bForceAVX2 = false; - bool bForceAVX512 = false; + bool bForceAVX = false; + bool bForceAVX2 = false; + bool bForceAVX512 = false; std::string isaRequest; }; - - struct JitLLVMContext : llvm::LLVMContext { }; - ////////////////////////////////////////////////////////////////////////// /// JitCache ////////////////////////////////////////////////////////////////////////// @@ -97,18 +94,15 @@ public: JitCache(); virtual ~JitCache() {} - void Init( - JitManager* pJitMgr, - const llvm::StringRef& cpu, - llvm::CodeGenOpt::Level level) + void Init(JitManager* pJitMgr, const llvm::StringRef& cpu, llvm::CodeGenOpt::Level level) { - mCpu = cpu.str(); - mpJitMgr = pJitMgr; + mCpu = cpu.str(); + mpJitMgr = pJitMgr; mOptLevel = level; } /// notifyObjectCompiled - Provides a pointer to compiled code for Module M. 
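// A short sketch of the ISA capping JitInstructionSet provides, assuming its
// constructor takes the requested ISA string (the body above lowercases and
// matches it against "avx", "avx2", and "avx512"):
void ExampleIsaCap()
{
    JitInstructionSet arch("avx"); // sets bForceAVX, clears the AVX2/AVX512 forces
    bool bmi2 = arch.BMI2();       // false: BMI2 reports unsupported when capped to AVX
    (void)bmi2;
}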
- void notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj) override; + void notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj) override; /// Returns a pointer to a newly allocated MemoryBuffer that contains the /// object which corresponds with Module M, or 0 if an object is not @@ -116,11 +110,11 @@ public: std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M) override; private: - std::string mCpu; + std::string mCpu; llvm::SmallString<MAX_PATH> mCacheDir; - uint32_t mCurrentModuleCRC = 0; - JitManager* mpJitMgr = nullptr; - llvm::CodeGenOpt::Level mOptLevel = llvm::CodeGenOpt::None; + uint32_t mCurrentModuleCRC = 0; + JitManager* mpJitMgr = nullptr; + llvm::CodeGenOpt::Level mOptLevel = llvm::CodeGenOpt::None; }; ////////////////////////////////////////////////////////////////////////// @@ -131,33 +125,33 @@ struct JitManager JitManager(uint32_t w, const char* arch, const char* core); ~JitManager(){}; - JitLLVMContext mContext; ///< LLVM compiler - llvm::IRBuilder<> mBuilder; ///< LLVM IR Builder - llvm::ExecutionEngine* mpExec; - JitCache mCache; + JitLLVMContext mContext; ///< LLVM compiler + llvm::IRBuilder<> mBuilder; ///< LLVM IR Builder + llvm::ExecutionEngine* mpExec; + JitCache mCache; // Need to be rebuilt after a JIT and before building new IR - llvm::Module* mpCurrentModule; - bool mIsModuleFinalized; - uint32_t mJitNumber; + llvm::Module* mpCurrentModule; + bool mIsModuleFinalized; + uint32_t mJitNumber; - uint32_t mVWidth; + uint32_t mVWidth; - bool mUsingAVX512 = false; + bool mUsingAVX512 = false; // fetch shader types - llvm::FunctionType* mFetchShaderTy; + llvm::FunctionType* mFetchShaderTy; - JitInstructionSet mArch; + JitInstructionSet mArch; // Debugging support std::unordered_map<llvm::StructType*, llvm::DIType*> mDebugStructMap; void SetupNewModule(); - void DumpAsm(llvm::Function* pFunction, const char* fileName); - static void DumpToFile(llvm::Function *f, const char *fileName); - static void DumpToFile(llvm::Module *M, const char *fileName); + void DumpAsm(llvm::Function* pFunction, const char* fileName); + static void DumpToFile(llvm::Function* f, const char* fileName); + static void DumpToFile(llvm::Module* M, const char* fileName); static std::string GetOutputDir(); // Debugging support methods @@ -177,6 +171,10 @@ struct JitManager return mDebugStructMap[pStructTy]; } - llvm::DIType* CreateDebugStructType(llvm::StructType* pType, const std::string& name, llvm::DIFile* pFile, uint32_t lineNum, - const std::vector<std::pair<std::string, uint32_t>>& members); + llvm::DIType* + CreateDebugStructType(llvm::StructType* pType, + const std::string& name, + llvm::DIFile* pFile, + uint32_t lineNum, + const std::vector<std::pair<std::string, uint32_t>>& members); }; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 20f2e42eec9..f89c502db7d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file blend_jit.cpp -* -* @brief Implementation of the blend jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file blend_jit.cpp + * + * @brief Implementation of the blend jitter + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder.h" #include "jit_api.h" @@ -47,8 +47,13 @@ struct BlendJit : public Builder { BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; - template<bool Color, bool Alpha> - void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) + template <bool Color, bool Alpha> + void GenerateBlendFactor(SWR_BLEND_FACTOR factor, + Value* constColor[4], + Value* src[4], + Value* src1[4], + Value* dst[4], + Value* result[4]) { Value* out[4]; @@ -77,7 +82,7 @@ struct BlendJit : public Builder break; case BLENDFACTOR_SRC_ALPHA_SATURATE: out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); - out[3] = VIMMED1(1.0f); + out[3] = VIMMED1(1.0f); break; case BLENDFACTOR_CONST_COLOR: out[0] = constColor[0]; @@ -158,7 +163,7 @@ struct BlendJit : public Builder void Clamp(SWR_FORMAT format, Value* src[4]) { const SWR_FORMAT_INFO& info = GetFormatInfo(format); - SWR_TYPE type = info.type[0]; + SWR_TYPE type = info.type[0]; switch (type) { @@ -179,7 +184,8 @@ struct BlendJit : public Builder src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); break; - case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type); + case SWR_TYPE_UNKNOWN: + SWR_INVALID("Unsupport format type: %d", type); } } @@ -187,7 +193,7 @@ struct BlendJit : public Builder { const SWR_FORMAT_INFO& info = GetFormatInfo(format); - bool valid[] = { false, false, false, false }; + bool valid[] = {false, false, false, false}; for (uint32_t c = 0; c < info.numComps; ++c) { valid[info.swizzle[c]] = true; @@ -210,7 +216,8 @@ struct BlendJit : public Builder { if (info.type[c] == SWR_TYPE_UNUSED) { - src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); + src[info.swizzle[c]] = + BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); } } } @@ -223,22 +230,28 @@ struct BlendJit : public Builder if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED) { uint32_t swizComp = info.swizzle[c]; - float factor = (float)((1 << info.bpc[c]) - 1); + float factor = (float)((1 << info.bpc[c]) - 1); switch (info.type[c]) { case SWR_TYPE_UNORM: src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); - src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor)); + src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor)); break; - default: SWR_INVALID("Unsupported format type: %d", info.type[c]); + default: + SWR_INVALID("Unsupported format type: %d", info.type[c]); } } } } - template<bool Color, bool Alpha> - void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4]) + template <bool Color, bool Alpha> + void BlendFunc(SWR_BLEND_OP blendOp, + Value* src[4], + Value* srcFactor[4], + Value* dst[4], + Value* dstFactor[4], + Value* result[4]) { Value* out[4]; Value* srcBlend[4]; @@ -308,7 +321,7 @@ struct BlendJit : public Builder void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4]) { // Op: (s == PS output, d = RT contents) - switch(logicOp) + switch (logicOp) { case LOGICOP_CLEAR: result[0] = VIMMED1(0); @@ -443,32 +456,49 @@ struct BlendJit : public Builder } } - void AlphaTest(const 
BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask) + void + AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask) { // load uint32_t reference - Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference })); - + Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference})); + // load alpha - Value* pAlpha = LOAD(ppAlpha, { 0, 0 }); + Value* pAlpha = LOAD(ppAlpha, {0, 0}); Value* pTest = nullptr; if (state.alphaTestFormat == ALPHA_TEST_UNORM8) { // convert float alpha to unorm8 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); - pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); + pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); // compare switch (state.alphaTestFunction) { - case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; - case ZFUNC_NEVER: pTest = VIMMED1(false); break; - case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break; - case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break; - case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break; - case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break; - case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break; - case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break; + case ZFUNC_ALWAYS: + pTest = VIMMED1(true); + break; + case ZFUNC_NEVER: + pTest = VIMMED1(false); + break; + case ZFUNC_LT: + pTest = ICMP_ULT(pAlphaU8, pRef); + break; + case ZFUNC_EQ: + pTest = ICMP_EQ(pAlphaU8, pRef); + break; + case ZFUNC_LE: + pTest = ICMP_ULE(pAlphaU8, pRef); + break; + case ZFUNC_GT: + pTest = ICMP_UGT(pAlphaU8, pRef); + break; + case ZFUNC_NE: + pTest = ICMP_NE(pAlphaU8, pRef); + break; + case ZFUNC_GE: + pTest = ICMP_UGE(pAlphaU8, pRef); + break; default: SWR_INVALID("Invalid alpha test function"); break; @@ -482,14 +512,30 @@ struct BlendJit : public Builder // compare switch (state.alphaTestFunction) { - case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; - case ZFUNC_NEVER: pTest = VIMMED1(false); break; - case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break; - case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break; - case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break; - case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break; - case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break; - case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break; + case ZFUNC_ALWAYS: + pTest = VIMMED1(true); + break; + case ZFUNC_NEVER: + pTest = VIMMED1(false); + break; + case ZFUNC_LT: + pTest = FCMP_OLT(pAlpha, pRef); + break; + case ZFUNC_EQ: + pTest = FCMP_OEQ(pAlpha, pRef); + break; + case ZFUNC_LE: + pTest = FCMP_OLE(pAlpha, pRef); + break; + case ZFUNC_GT: + pTest = FCMP_OGT(pAlpha, pRef); + break; + case ZFUNC_NE: + pTest = FCMP_ONE(pAlpha, pRef); + break; + case ZFUNC_GE: + pTest = FCMP_OGE(pAlpha, pRef); + break; default: SWR_INVALID("Invalid alpha test function"); break; @@ -514,22 +560,24 @@ struct BlendJit : public Builder Function* Create(const BLEND_COMPILE_STATE& state) { - std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + std::stringstream fnName("BLND_", + std::ios_base::in | std::ios_base::out | std::ios_base::ate); fnName << ComputeCRC(0, &state, sizeof(state)); // blend function signature - //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*); + // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*); std::vector<Type*> args{ PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT* }; - //std::vector<Type*> args{ + // std::vector<Type*> args{ // 
PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT* //}; - FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); - Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* blendFunc = Function::Create( + fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); blendFunc->getParent()->setModuleIdentifier(blendFunc->getName()); BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); @@ -537,29 +585,30 @@ struct BlendJit : public Builder IRB()->SetInsertPoint(entry); // arguments - auto argitr = blendFunc->arg_begin(); + auto argitr = blendFunc->arg_begin(); Value* pBlendContext = &*argitr++; pBlendContext->setName("pBlendContext"); - Value* pBlendState = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pBlendState }); + Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState}); pBlendState->setName("pBlendState"); - Value* pSrc = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src }); + Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src}); pSrc->setName("src"); - Value* pSrc1 = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src1 }); + Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1}); pSrc1->setName("src1"); - Value* pSrc0Alpha = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src0alpha }); + Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha}); pSrc0Alpha->setName("src0alpha"); - Value* sampleNum = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_sampleNum }); + Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum}); sampleNum->setName("sampleNum"); - Value* pDst = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pDst }); + Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst}); pDst->setName("pDst"); - Value* pResult = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_result }); + Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result}); pResult->setName("result"); - Value* ppoMask = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_oMask }); + Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask}); ppoMask->setName("ppoMask"); - Value* ppMask = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pMask }); + Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask}); ppMask->setName("pMask"); - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, + "Unsupported hot tile format"); Value* dst[4]; Value* constantColor[4]; Value* src[4]; @@ -568,44 +617,44 @@ struct BlendJit : public Builder for (uint32_t i = 0; i < 4; ++i) { // load hot tile - dst[i] = LOAD(pDst, { 0, i }); + dst[i] = LOAD(pDst, {0, i}); // load constant color - constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); - + constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i})); + // load src - src[i] = LOAD(pSrc, { 0, i }); + src[i] = LOAD(pSrc, {0, i}); // load src1 - src1[i] = LOAD(pSrc1, { 0, i }); + src1[i] = LOAD(pSrc1, {0, i}); } Value* currentSampleMask = VIMMED1(-1); if (state.desc.alphaToCoverageEnable) { - Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); - uint32_t bits = (1 << state.desc.numSamples) - 1; - currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); - currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); + Value* 
pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); + uint32_t bits = (1 << state.desc.numSamples) - 1; + currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); + currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test if (state.desc.alphaTestEnable) { // Gather for archrast stats - STORE(C(1), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaTested }); + STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); AlphaTest(state, pBlendState, pSrc0Alpha, ppMask); } else { // Gather for archrast stats - STORE(C(0), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaTested }); + STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); } // color blend if (state.blendState.blendEnable) { // Gather for archrast stats - STORE(C(1), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaBlended }); + STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); // clamp sources Clamp(state.format, src); @@ -635,40 +684,57 @@ struct BlendJit : public Builder Value* dstFactor[4]; if (state.desc.independentAlphaBlendEnable) { - GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); - GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); - - GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); - GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); - - BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); - BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + GenerateBlendFactor<true, false>( + state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, + constantColor, + src, + src1, + dst, + srcFactor); + + GenerateBlendFactor<true, false>( + state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, + constantColor, + src, + src1, + dst, + dstFactor); + + BlendFunc<true, false>( + state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>( + state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); } else { - GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); - GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); + GenerateBlendFactor<true, true>( + state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); + GenerateBlendFactor<true, true>( + state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); - BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<true, true>( + state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); } // store results out for (uint32_t i = 0; i < 4; ++i) { - STORE(result[i], pResult, { 0, i }); + STORE(result[i], pResult, {0, i}); } } else { // Gather for archrast stats - STORE(C(0), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaBlended }); + STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); } - - if(state.blendState.logicOpEnable) + + if (state.blendState.logicOpEnable) { const SWR_FORMAT_INFO& info = 
GetFormatInfo(state.format); - Value* vMask[4]; - float scale[4]; + Value* vMask[4]; + float scale[4]; if (!state.blendState.blendEnable) { @@ -676,7 +742,7 @@ struct BlendJit : public Builder Clamp(state.format, dst); } - for(uint32_t i = 0; i < 4; i++) + for (uint32_t i = 0; i < 4; i++) { if (info.type[i] == SWR_TYPE_UNUSED) { @@ -713,20 +779,12 @@ struct BlendJit : public Builder dst[i] = BITCAST(dst[i], mSimdInt32Ty); break; case SWR_TYPE_SNORM: - src[i] = FP_TO_SI( - FMUL(src[i], VIMMED1(scale[i])), - mSimdInt32Ty); - dst[i] = FP_TO_SI( - FMUL(dst[i], VIMMED1(scale[i])), - mSimdInt32Ty); + src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); + dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); break; case SWR_TYPE_UNORM: - src[i] = FP_TO_UI( - FMUL(src[i], VIMMED1(scale[i])), - mSimdInt32Ty); - dst[i] = FP_TO_UI( - FMUL(dst[i], VIMMED1(scale[i])), - mSimdInt32Ty); + src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); + dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); break; } } @@ -734,7 +792,7 @@ struct BlendJit : public Builder LogicOpFunc(state.blendState.logicOpFunc, src, dst, result); // store results out - for(uint32_t i = 0; i < 4; ++i) + for (uint32_t i = 0; i < 4; ++i) { if (info.type[i] == SWR_TYPE_UNUSED) { @@ -761,12 +819,10 @@ struct BlendJit : public Builder case SWR_TYPE_SNORM: result[i] = SHL(result[i], C(32 - info.bpc[i])); result[i] = ASHR(result[i], C(32 - info.bpc[i])); - result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), - VIMMED1(1.0f / scale[i])); + result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); break; case SWR_TYPE_UNORM: - result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), - VIMMED1(1.0f / scale[i])); + result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); break; } @@ -774,27 +830,27 @@ struct BlendJit : public Builder } } - if(state.desc.oMaskEnable) + if (state.desc.oMaskEnable) { assert(!(state.desc.alphaToCoverageEnable)); // load current mask - Value* oMask = LOAD(ppoMask); + Value* oMask = LOAD(ppoMask); currentSampleMask = AND(oMask, currentSampleMask); } - if(state.desc.sampleMaskEnable) + if (state.desc.sampleMaskEnable) { - Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask}); + Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask}); currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask); } - if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || - state.desc.oMaskEnable) + if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || + state.desc.oMaskEnable) { // load coverage mask and mask off any lanes with no samples - Value* pMask = LOAD(ppMask); + Value* pMask = LOAD(ppMask); Value* sampleMasked = SHL(C(1), sampleNum); - currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); + currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty); Value* outputMask = AND(pMask, currentSampleMask); // store new mask @@ -836,11 +892,12 @@ struct BlendJit : public Builder /// @return PFN_FETCH_FUNC - pointer to fetch code PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) { - const llvm::Function *func = (const llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_BLEND_JIT_FUNC pfnBlend; + const llvm::Function* func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 
+ PFN_BLEND_JIT_FUNC pfnBlend; pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot + // add new IR to the module pJitMgr->mIsModuleFinalized = true; return pfnBlend; @@ -850,14 +907,15 @@ PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) /// @brief JIT compiles blend shader /// @param hJitMgr - JitManager handle /// @param state - blend state to build function from -extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) +extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, + const BLEND_COMPILE_STATE& state) { JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); pJitMgr->SetupNewModule(); BlendJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(state); + HANDLE hFunc = theJit.Create(state); return JitBlendFunc(hJitMgr, hFunc); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h index ddb7374d406..3e78054eced 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file blend_jit.h -* -* @brief Definition of the blend jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file blend_jit.h + * + * @brief Definition of the blend jitter + * + * Notes: + * + ******************************************************************************/ #pragma once #include "common/formats.h" @@ -34,15 +34,15 @@ struct RENDER_TARGET_BLEND_COMPILE_STATE { - bool blendEnable; - bool logicOpEnable; + bool blendEnable; + bool logicOpEnable; SWR_BLEND_FACTOR sourceAlphaBlendFactor; SWR_BLEND_FACTOR destAlphaBlendFactor; SWR_BLEND_FACTOR sourceBlendFactor; SWR_BLEND_FACTOR destBlendFactor; - SWR_BLEND_OP colorBlendFunc; - SWR_BLEND_OP alphaBlendFunc; - SWR_LOGIC_OP logicOpFunc; + SWR_BLEND_OP colorBlendFunc; + SWR_BLEND_OP alphaBlendFunc; + SWR_LOGIC_OP logicOpFunc; }; enum ALPHA_TEST_FORMAT @@ -60,14 +60,14 @@ struct BLEND_DESC { struct { - uint32_t alphaTestEnable: 1; - uint32_t independentAlphaBlendEnable: 1; - uint32_t alphaToCoverageEnable: 1; - uint32_t oMaskEnable:1; - uint32_t inputCoverageEnable:1; - uint32_t sampleMaskEnable:1; - uint32_t numSamples:5; - uint32_t _reserved : 21; + uint32_t alphaTestEnable : 1; + uint32_t independentAlphaBlendEnable : 1; + uint32_t alphaToCoverageEnable : 1; + uint32_t oMaskEnable : 1; + uint32_t inputCoverageEnable : 1; + uint32_t sampleMaskEnable : 1; + uint32_t numSamples : 5; + uint32_t _reserved : 21; }; uint32_t bits; }; @@ -78,11 +78,11 @@ struct BLEND_DESC ////////////////////////////////////////////////////////////////////////// struct BLEND_COMPILE_STATE { - SWR_FORMAT format; // format of render target being blended + SWR_FORMAT format; // format of render target being blended RENDER_TARGET_BLEND_COMPILE_STATE blendState; - BLEND_DESC desc; + BLEND_DESC desc; - SWR_ZFUNCTION alphaTestFunction; + SWR_ZFUNCTION alphaTestFunction; ALPHA_TEST_FORMAT alphaTestFormat; bool operator==(const BLEND_COMPILE_STATE& other) const @@ -95,18 +95,18 @@ struct BLEND_COMPILE_STATE { if (!desc.alphaTestEnable) { - alphaTestFormat = (ALPHA_TEST_FORMAT)0; + alphaTestFormat = (ALPHA_TEST_FORMAT)0; alphaTestFunction = (SWR_ZFUNCTION)0; } if (!blendState.blendEnable) { blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.sourceBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.colorBlendFunc = (SWR_BLEND_OP)0; - blendState.alphaBlendFunc = (SWR_BLEND_OP)0; + blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; + blendState.sourceBlendFactor = (SWR_BLEND_FACTOR)0; + blendState.destBlendFactor = (SWR_BLEND_FACTOR)0; + blendState.colorBlendFunc = (SWR_BLEND_OP)0; + blendState.alphaBlendFunc = (SWR_BLEND_OP)0; } if (!blendState.logicOpEnable) @@ -122,8 +122,8 @@ struct BLEND_COMPILE_STATE if (!desc.independentAlphaBlendEnable) { blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.alphaBlendFunc = (SWR_BLEND_OP)0; + blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; + blendState.alphaBlendFunc = (SWR_BLEND_OP)0; } } }; diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 4b06aaa3ab1..ef95e0103f8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -38,7 +38,7 @@ namespace SwrJit ////////////////////////////////////////////////////////////////////////// /// @brief Contructor for Builder. /// @param pJitMgr - JitManager which contains modules, function passes, etc. - Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr), mpPrivateContext(nullptr) + Builder::Builder(JitManager* pJitMgr) : mpJitMgr(pJitMgr), mpPrivateContext(nullptr) { mVWidth = pJitMgr->mVWidth; mVWidth16 = 16; @@ -79,7 +79,7 @@ namespace SwrJit mSimd32Int8Ty = VectorType::get(mInt8Ty, 32); - if (sizeof(uint32_t *) == 4) + if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; mSimdIntPtrTy = mSimdInt32Ty; @@ -87,7 +87,7 @@ namespace SwrJit } else { - SWR_ASSERT(sizeof(uint32_t *) == 8); + SWR_ASSERT(sizeof(uint32_t*) == 8); mIntPtrTy = mInt64Ty; mSimdIntPtrTy = mSimdInt64Ty; @@ -111,38 +111,38 @@ namespace SwrJit } /// @brief Mark this alloca as temporary to avoid hoisting later on - void Builder::SetTempAlloca(Value *inst) + void Builder::SetTempAlloca(Value* inst) { - AllocaInst *pAlloca = dyn_cast<AllocaInst>(inst); + AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst); SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction"); - MDNode *N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, "is_temp_alloca")); + MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, "is_temp_alloca")); pAlloca->setMetadata("is_temp_alloca", N); } - bool Builder::IsTempAlloca(Value *inst) + bool Builder::IsTempAlloca(Value* inst) { - AllocaInst *pAlloca = dyn_cast<AllocaInst>(inst); + AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst); SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction"); return (pAlloca->getMetadata("is_temp_alloca") != nullptr); } // Returns true if able to find a call instruction to mark - bool Builder::SetNamedMetaDataOnCallInstr(Instruction *inst, StringRef mdName) + bool Builder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) { - CallInst *pCallInstr = dyn_cast<CallInst>(inst); + CallInst* pCallInstr = dyn_cast<CallInst>(inst); if (pCallInstr) { - MDNode *N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, mdName)); + MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, mdName)); pCallInstr->setMetadata(mdName, N); return true; } else { // Follow use def chain back up - for (Use &u : inst->operands()) + for (Use& u : inst->operands()) { - Instruction *srcInst = dyn_cast<Instruction>(u.get()); + Instruction* srcInst = dyn_cast<Instruction>(u.get()); if (srcInst) { if (SetNamedMetaDataOnCallInstr(srcInst, mdName)) @@ -156,10 +156,9 @@ namespace SwrJit return false; } - bool Builder::HasNamedMetaDataOnCallInstr(Instruction *inst, - StringRef mdName) + bool Builder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) { - CallInst *pCallInstr = dyn_cast<CallInst>(inst); + CallInst* pCallInstr = dyn_cast<CallInst>(inst); if (!pCallInstr) { @@ -171,7 +170,7 @@ namespace SwrJit ////////////////////////////////////////////////////////////////////////// /// @brief Packetizes the type. Assumes SOA conversion. 
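// A small sketch of the temp-alloca tagging shown above, assuming a Builder
// with a live IRBuilder insert point; the helper marks an alloca so later
// hoisting passes can skip it, and the mark can be read back:
void ExampleTempAlloca(SwrJit::Builder& builder)
{
    llvm::Value* pTmp = builder.IRB()->CreateAlloca(builder.mSimdFP32Ty);
    builder.SetTempAlloca(pTmp);              // attaches the "is_temp_alloca" MDNode
    bool isTemp = builder.IsTempAlloca(pTmp); // true: reads the same metadata back
    (void)isTemp;
}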
- Type *Builder::GetVectorType(Type *pType) + Type* Builder::GetVectorType(Type* pType) { if (pType->isVectorTy()) { @@ -182,24 +181,24 @@ namespace SwrJit if (pType->isArrayTy()) { uint32_t arraySize = pType->getArrayNumElements(); - Type * pArrayType = pType->getArrayElementType(); - Type * pVecArrayType = GetVectorType(pArrayType); - Type * pVecType = ArrayType::get(pVecArrayType, arraySize); + Type* pArrayType = pType->getArrayElementType(); + Type* pVecArrayType = GetVectorType(pArrayType); + Type* pVecType = ArrayType::get(pVecArrayType, arraySize); return pVecType; } // {float,int} should packetize to {<8 x float>, <8 x int>} if (pType->isAggregateType()) { - uint32_t numElems = pType->getStructNumElements(); - SmallVector<Type *, 8> vecTypes; + uint32_t numElems = pType->getStructNumElements(); + SmallVector<Type*, 8> vecTypes; for (uint32_t i = 0; i < numElems; ++i) { - Type *pElemType = pType->getStructElementType(i); - Type *pVecElemType = GetVectorType(pElemType); + Type* pElemType = pType->getStructElementType(i); + Type* pVecElemType = GetVectorType(pElemType); vecTypes.push_back(pVecElemType); } - Type *pVecType = StructType::get(JM()->mContext, vecTypes); + Type* pVecType = StructType::get(JM()->mContext, vecTypes); return pVecType; } @@ -211,7 +210,7 @@ namespace SwrJit } // <ty> should packetize to <8 x <ty>> - Type *vecType = VectorType::get(pType, JM()->mVWidth); + Type* vecType = VectorType::get(pType, JM()->mVWidth); return vecType; } -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index c49d07e056c..a047f2a065f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder.h -* -* @brief Includes all the builder related functionality -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file builder.h + * + * @brief Includes all the builder related functionality + * + * Notes: + * + ******************************************************************************/ #pragma once #include "JitManager.h" @@ -37,98 +37,99 @@ namespace SwrJit ///@todo Move this to better place enum SHADER_STATS_COUNTER_TYPE { - STATS_INST_EXECUTED = 0, - STATS_SAMPLE_EXECUTED = 1, - STATS_SAMPLE_L_EXECUTED = 2, - STATS_SAMPLE_B_EXECUTED = 3, - STATS_SAMPLE_C_EXECUTED = 4, - STATS_SAMPLE_C_LZ_EXECUTED = 5, - STATS_SAMPLE_C_D_EXECUTED = 6, - STATS_LOD_EXECUTED = 7, - STATS_GATHER4_EXECUTED = 8, - STATS_GATHER4_C_EXECUTED = 9, - STATS_GATHER4_C_PO_EXECUTED = 10, + STATS_INST_EXECUTED = 0, + STATS_SAMPLE_EXECUTED = 1, + STATS_SAMPLE_L_EXECUTED = 2, + STATS_SAMPLE_B_EXECUTED = 3, + STATS_SAMPLE_C_EXECUTED = 4, + STATS_SAMPLE_C_LZ_EXECUTED = 5, + STATS_SAMPLE_C_D_EXECUTED = 6, + STATS_LOD_EXECUTED = 7, + STATS_GATHER4_EXECUTED = 8, + STATS_GATHER4_C_EXECUTED = 9, + STATS_GATHER4_C_PO_EXECUTED = 10, STATS_GATHER4_C_PO_C_EXECUTED = 11, - STATS_LOAD_RAW_UAV = 12, - STATS_LOAD_RAW_RESOURCE = 13, - STATS_STORE_RAW_UAV = 14, - STATS_STORE_TGSM = 15, - STATS_DISCARD = 16, - STATS_BARRIER = 17, + STATS_LOAD_RAW_UAV = 12, + STATS_LOAD_RAW_RESOURCE = 13, + STATS_STORE_RAW_UAV = 14, + STATS_STORE_TGSM = 15, + STATS_DISCARD = 16, + STATS_BARRIER = 17, }; using namespace llvm; struct Builder { - Builder(JitManager *pJitMgr); + Builder(JitManager* pJitMgr); virtual ~Builder() {} - IRBuilder<> *IRB() { return mpIRBuilder; }; - JitManager *JM() { return mpJitMgr; } + IRBuilder<>* IRB() { return mpIRBuilder; }; + JitManager* JM() { return mpJitMgr; } - JitManager *mpJitMgr; - IRBuilder<> *mpIRBuilder; + JitManager* mpJitMgr; + IRBuilder<>* mpIRBuilder; - uint32_t mVWidth; // vector width target simd - uint32_t mVWidth16; // vector width simd16 + uint32_t mVWidth; // vector width target simd + uint32_t mVWidth16; // vector width simd16 // Built in types: scalar - Type* mVoidTy; - Type* mInt1Ty; - Type* mInt8Ty; - Type* mInt16Ty; - Type* mInt32Ty; - Type* mInt64Ty; - Type* mIntPtrTy; - Type* mFP16Ty; - Type* mFP32Ty; - Type* mFP32PtrTy; - Type* mDoubleTy; - Type* mInt8PtrTy; - Type* mInt16PtrTy; - Type* mInt32PtrTy; - - Type* mSimd4FP64Ty; + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* 
mFP32PtrTy; + Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; + + Type* mSimd4FP64Ty; // Built in types: target SIMD - Type* mSimdFP16Ty; - Type* mSimdFP32Ty; - Type* mSimdInt1Ty; - Type* mSimdInt16Ty; - Type* mSimdInt32Ty; - Type* mSimdInt64Ty; - Type* mSimdIntPtrTy; - Type* mSimdVectorTy; - Type* mSimdVectorTRTy; - Type* mSimdVectorIntTy; + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt1Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + Type* mSimdVectorTy; + Type* mSimdVectorTRTy; + Type* mSimdVectorIntTy; // Built in types: simd16 - Type* mSimd16FP16Ty; - Type* mSimd16FP32Ty; - Type* mSimd16Int1Ty; - Type* mSimd16Int16Ty; - Type* mSimd16Int32Ty; - Type* mSimd16Int64Ty; - Type* mSimd16IntPtrTy; - Type* mSimd16VectorTy; - Type* mSimd16VectorTRTy; - - Type* mSimd32Int8Ty; - - void SetTargetWidth(uint32_t width); - void SetTempAlloca(Value* inst); - bool IsTempAlloca(Value* inst); - bool SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); - bool HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + Type* mSimd16FP16Ty; + Type* mSimd16FP32Ty; + Type* mSimd16Int1Ty; + Type* mSimd16Int16Ty; + Type* mSimd16Int32Ty; + Type* mSimd16Int64Ty; + Type* mSimd16IntPtrTy; + Type* mSimd16VectorTy; + Type* mSimd16VectorTRTy; + + Type* mSimd32Int8Ty; + + void SetTargetWidth(uint32_t width); + void SetTempAlloca(Value* inst); + bool IsTempAlloca(Value* inst); + bool SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + bool HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); Type* GetVectorType(Type* pType); - void SetMetadata(StringRef s, uint32_t val) + void SetMetadata(StringRef s, uint32_t val) { - llvm::NamedMDNode *metaData = mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s); - Constant* cval = mpIRBuilder->getInt32(val); - llvm::MDNode *mdNode = llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(), llvm::ConstantAsMetadata::get(cval)); + llvm::NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s); + Constant* cval = mpIRBuilder->getInt32(val); + llvm::MDNode* mdNode = llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(), + llvm::ConstantAsMetadata::get(cval)); if (metaData->getNumOperands()) { metaData->setOperand(0, mdNode); @@ -143,8 +144,8 @@ namespace SwrJit NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getNamedMetadata(s); if (metaData) { - MDNode* mdNode = metaData->getOperand(0); - Metadata* val = mdNode->getOperand(0); + MDNode* mdNode = metaData->getOperand(0); + Metadata* val = mdNode->getOperand(0); return mdconst::dyn_extract<ConstantInt>(val)->getZExtValue(); } else @@ -161,17 +162,15 @@ namespace SwrJit #include "builder_mem.h" protected: - - void SetPrivateContext(Value* pPrivateContext) - { - mpPrivateContext = pPrivateContext; + void SetPrivateContext(Value* pPrivateContext) + { + mpPrivateContext = pPrivateContext; NotifyPrivateContextSet(); } - virtual void NotifyPrivateContextSet() {} + virtual void NotifyPrivateContextSet() {} inline Value* GetPrivateContext() { return mpPrivateContext; } - private: + private: Value* mpPrivateContext; - }; -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp index 3013bc53d7f..3f4b090cfc8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp @@ -1,32 +1,32 @@ 
/**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_gfx_mem.cpp -* -* @brief Definition of the gfx mem builder -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file builder_gfx_mem.cpp + * + * @brief Definition of the gfx mem builder + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder.h" #include "common/rdtsc_buckets.h" @@ -37,12 +37,11 @@ namespace SwrJit { using namespace llvm; - BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) : - Builder(pJitMgr) + BuilderGfxMem::BuilderGfxMem(JitManager *pJitMgr) : Builder(pJitMgr) { - mpTranslationFuncTy = nullptr; + mpTranslationFuncTy = nullptr; mpfnTranslateGfxAddress = nullptr; - mpParamSimDC = nullptr; + mpParamSimDC = nullptr; } @@ -50,9 +49,10 @@ namespace SwrJit { } - void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage) + void BuilderGfxMem::AssertGFXMemoryParams(Value *ptr, Builder::JIT_MEM_CLIENT usage) { - SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT_INTERNAL), "Internal memory should not be gfxptr_t."); + SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT_INTERNAL), + "Internal memory should not be gfxptr_t."); } @@ -64,16 +64,20 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value* BuilderGfxMem::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, - uint8_t scale, JIT_MEM_CLIENT usage) - { - // address may be coming in as 64bit int now so get the pointer + Value *BuilderGfxMem::GATHERPS(Value * vSrc, + Value * pBase, + Value * vIndices, + Value * vMask, + uint8_t scale, + JIT_MEM_CLIENT usage) + { + // address may be coming in as 64bit int now so get the pointer if (pBase->getType() == mInt64Ty) { pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0)); } - Value* vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale); + Value *vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale); return vGather; } @@ -85,8 +89,12 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value* BuilderGfxMem::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, - uint8_t scale, JIT_MEM_CLIENT usage) + Value *BuilderGfxMem::GATHERDD(Value * vSrc, + Value * pBase, + Value * vIndices, + Value * vMask, + uint8_t scale, + JIT_MEM_CLIENT usage) { // address may be coming in as 64bit int now so get the pointer @@ -95,41 +103,42 @@ namespace SwrJit pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0)); } - Value* vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale); + Value *vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale); return vGather; } - Value* BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) + Value *BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value *base, Constant *offset) { return ADD(base, offset); } - Value* BuilderGfxMem::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name) + Value *BuilderGfxMem::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name) { Ptr = TranslationHelper(Ptr, Ty); return Builder::GEP(Ptr, Idx, nullptr, Name); } - Value* BuilderGfxMem::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) + Value *BuilderGfxMem::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name) { Ptr = TranslationHelper(Ptr, Ty); return Builder::GEP(Ty, Ptr, Idx, Name); } - Value* BuilderGfxMem::GEP(Value* Ptr, const 
std::initializer_list<Value*>& indexList, Type* Ty) + Value *BuilderGfxMem::GEP(Value *Ptr, const std::initializer_list<Value *> &indexList, Type *Ty) { Ptr = TranslationHelper(Ptr, Ty); return Builder::GEP(Ptr, indexList); } - Value* BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty) + Value * + BuilderGfxMem::GEP(Value *Ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty) { Ptr = TranslationHelper(Ptr, Ty); return Builder::GEP(Ptr, indexList); } - Value* BuilderGfxMem::TranslationHelper(Value* Ptr, Type* Ty) + Value *BuilderGfxMem::TranslationHelper(Value *Ptr, Type *Ty) { SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr), "Access of GFX pointers must have non-null type specified."); @@ -144,7 +153,7 @@ namespace SwrJit return Ptr; } - LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst *BuilderGfxMem::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); @@ -152,7 +161,7 @@ namespace SwrJit return Builder::LOAD(Ptr, Name); } - LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst *BuilderGfxMem::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); @@ -160,7 +169,8 @@ namespace SwrJit return Builder::LOAD(Ptr, Name); } - LoadInst* BuilderGfxMem::LOAD(Value* Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst *BuilderGfxMem::LOAD( + Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); @@ -168,7 +178,11 @@ namespace SwrJit return Builder::LOAD(Ptr, isVolatile, Name); } - LoadInst* BuilderGfxMem::LOAD(Value* BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst *BuilderGfxMem::LOAD(Value * BasePtr, + const std::initializer_list<uint32_t> &offset, + const llvm::Twine & name, + Type * Ty, + JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(BasePtr, usage); @@ -176,10 +190,10 @@ namespace SwrJit if (BasePtr->getType() == mInt64Ty) { SWR_ASSERT(Ty); - BasePtr = INT_TO_PTR(BasePtr, Ty, name); + BasePtr = INT_TO_PTR(BasePtr, Ty, name); bNeedTranslation = true; } - std::vector<Value*> valIndices; + std::vector<Value *> valIndices; for (auto i : offset) { valIndices.push_back(C(i)); @@ -193,7 +207,13 @@ namespace SwrJit return LOAD(BasePtr, name, Ty, usage); } - CallInst* BuilderGfxMem::MASKED_LOAD(Value* Ptr, unsigned Align, Value* Mask, Value* PassThru, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) + CallInst *BuilderGfxMem::MASKED_LOAD(Value * Ptr, + unsigned Align, + Value * Mask, + Value * PassThru, + const Twine & Name, + Type * Ty, + JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); @@ -201,7 +221,10 @@ namespace SwrJit return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage); } - Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress, Type* PtrTy, const Twine &Name, JIT_MEM_CLIENT /* usage */) + Value *BuilderGfxMem::TranslateGfxAddress(Value * xpGfxAddress, + Type * PtrTy, + const Twine &Name, + JIT_MEM_CLIENT /* usage */) { if (PtrTy == nullptr) { @@ -209,4 +232,4 @@ namespace SwrJit } return INT_TO_PTR(xpGfxAddress, PtrTy, Name); } -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h index 00817b2b52b..ab6f78ee817 100644 --- 
a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_gfx_mem.h -* -* @brief Definition of the builder to support different translation types for gfx memory access -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
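// The overrides this header declares all follow the translate-then-delegate
// shape visible in the .cpp hunk above. A minimal sketch of that pattern
// (assumed body; the diff elides some context lines of the real one):
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
    AssertGFXMemoryParams(Ptr, usage); // i64 addresses only legal for gfx clients
    Ptr = TranslationHelper(Ptr, Ty);  // gfxptr_t (i64) -> typed host pointer
    return Builder::LOAD(Ptr, Name);   // then an ordinary base-class load
}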
+ * + * @file builder_gfx_mem.h + * + * @brief Definition of the builder to support different translation types for gfx memory access + * + * Notes: + * + ******************************************************************************/ #pragma once #include "builder.h" @@ -38,28 +38,67 @@ namespace SwrJit class BuilderGfxMem : public Builder { public: - BuilderGfxMem(JitManager* pJitMgr); + BuilderGfxMem(JitManager *pJitMgr); virtual ~BuilderGfxMem() {} virtual Value *GEP(Value *Ptr, Value *Idx, Type *Ty = nullptr, const Twine &Name = ""); virtual Value *GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = ""); - virtual Value *GEP(Value* Ptr, const std::initializer_list<Value*> &indexList, Type *Ty = nullptr); - virtual Value *GEP(Value* Ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty = nullptr); - - virtual LoadInst* LOAD(Value *Ptr, const char *Name, Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value *Ptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value *Ptr, bool isVolatile, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - - virtual CallInst* MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = nullptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - - virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - - - Value* TranslateGfxAddress(Value* xpGfxAddress, Type* PtrTy = nullptr, const Twine &Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + virtual Value * + GEP(Value *Ptr, const std::initializer_list<Value *> &indexList, Type *Ty = nullptr); + virtual Value * + GEP(Value *Ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty = nullptr); + + virtual LoadInst *LOAD(Value * Ptr, + const char * Name, + Type * Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + virtual LoadInst *LOAD(Value * Ptr, + const Twine & Name = "", + Type * Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + virtual LoadInst *LOAD(Value * Ptr, + bool isVolatile, + const Twine & Name = "", + Type * Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + virtual LoadInst *LOAD(Value * BasePtr, + const std::initializer_list<uint32_t> &offset, + const llvm::Twine & Name = "", + Type * Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual CallInst *MASKED_LOAD(Value * Ptr, + unsigned Align, + Value * Mask, + Value * PassThru = nullptr, + const Twine & Name = "", + Type * Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual Value *GATHERPS(Value * src, + Value * pBase, + Value * indices, + Value * mask, + uint8_t scale = 1, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + virtual Value *GATHERDD(Value * src, + Value * pBase, + Value * indices, + Value * mask, + uint8_t scale = 1, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + + Value *TranslateGfxAddress(Value * xpGfxAddress, + Type * PtrTy = nullptr, + const Twine & Name = "", + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); template 
<typename T> - Value* TranslateGfxAddress(Value* xpGfxBaseAddress, const std::initializer_list<T> &offset, Type* PtrTy = nullptr, const Twine &Name = "", JIT_MEM_CLIENT usage = GFX_MEM_CLIENT_SHADER) + Value *TranslateGfxAddress(Value * xpGfxBaseAddress, + const std::initializer_list<T> &offset, + Type * PtrTy = nullptr, + const Twine & Name = "", + JIT_MEM_CLIENT usage = GFX_MEM_CLIENT_SHADER) { AssertGFXMemoryParams(xpGfxBaseAddress, usage); SWR_ASSERT(xpGfxBaseAddress->getType()->isPointerTy() == false); @@ -69,31 +108,29 @@ namespace SwrJit PtrTy = mInt8PtrTy; } - Value* ptr = INT_TO_PTR(xpGfxBaseAddress, PtrTy); - ptr = GEP(ptr, offset); + Value *ptr = INT_TO_PTR(xpGfxBaseAddress, PtrTy); + ptr = GEP(ptr, offset); return TranslateGfxAddress(PTR_TO_INT(ptr, mInt64Ty), PtrTy, Name, usage); } protected: + void AssertGFXMemoryParams(Value *ptr, Builder::JIT_MEM_CLIENT usage); - void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage); - virtual void NotifyPrivateContextSet(); - virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset); + virtual Value *OFFSET_TO_NEXT_COMPONENT(Value *base, Constant *offset); - Value* TranslationHelper(Value *Ptr, Type *Ty); + Value *TranslationHelper(Value *Ptr, Type *Ty); - FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; } - Value* GetTranslationFunction() { return mpfnTranslateGfxAddress; } - Value* GetParamSimDC() { return mpParamSimDC; } + FunctionType *GetTranslationFunctionType() { return mpTranslationFuncTy; } + Value * GetTranslationFunction() { return mpfnTranslateGfxAddress; } + Value * GetParamSimDC() { return mpParamSimDC; } private: - - FunctionType* mpTranslationFuncTy; - Value* mpfnTranslateGfxAddress; - Value* mpParamSimDC; + FunctionType *mpTranslationFuncTy; + Value * mpfnTranslateGfxAddress; + Value * mpParamSimDC; }; -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h index 92867ec9836..02aa6f97cdf 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
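// A hedged scalar model of what the gfx translation above ultimately
// computes. The mpfnTranslateGfxAddress member suggests a per-process
// translation callback; that callback shape is an assumption here, and the
// offset handling mirrors the template body (offset applied on the gfx side,
// one final translation at the end).
#include <cstdint>

using TranslateFn = uint8_t* (*)(uint64_t gfxAddr);

static uint8_t* TranslateWithOffset(uint64_t xpGfxBase, uint64_t byteOffset,
                                    TranslateFn pfnTranslate)
{
    // Offset arithmetic happens on the gfx-address side; only the final
    // address is converted into a host-visible pointer.
    return pfnTranslate(xpGfxBase + byteOffset);
}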
-* -* @file builder_math.h -* -* @brief math/alu builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file builder_math.h + * + * @brief math/alu builder functions + * + * Notes: + * + ******************************************************************************/ #pragma once Value* VLOG2PS(Value* src); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index 77c2095ea9c..94489f1c7fd 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_misc.cpp -* -* @brief Implementation for miscellaneous builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file builder_misc.cpp + * + * @brief Implementation for miscellaneous builder functions + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder.h" #include "common/rdtsc_buckets.h" @@ -37,20 +37,22 @@ namespace SwrJit { void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage) { - SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access. Requires translation through BuilderGfxMem."); + SWR_ASSERT( + ptr->getType() != mInt64Ty, + "Address appears to be GFX access. Requires translation through BuilderGfxMem."); } - Value *Builder::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name) + Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name) { return IRB()->CreateGEP(Ptr, Idx, Name); } - Value *Builder::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name) + Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) { return IRB()->CreateGEP(Ty, Ptr, Idx, Name); } - Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty) + Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty) { std::vector<Value*> indices; for (auto i : indexList) @@ -58,7 +60,7 @@ namespace SwrJit return GEPA(ptr, indices); } - Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty) + Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty) { std::vector<Value*> indices; for (auto i : indexList) @@ -66,17 +68,17 @@ namespace SwrJit return GEPA(ptr, indices); } - Value *Builder::GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name) + Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name) { return IRB()->CreateGEP(Ptr, IdxList, Name); } - Value *Builder::GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name) + Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name) { return IRB()->CreateGEP(Ty, Ptr, IdxList, Name); } - Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList) + Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList) { std::vector<Value*> indices; for (auto i : indexList) @@ -84,7 +86,7 @@ namespace SwrJit return IN_BOUNDS_GEP(ptr, indices); } - Value 
*Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) + Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList) { std::vector<Value*> indices; for (auto i : indexList) @@ -92,31 +94,36 @@ namespace SwrJit return IN_BOUNDS_GEP(ptr, indices); } - LoadInst* Builder::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(Ptr, usage); return IRB()->CreateLoad(Ptr, Name); } - LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(Ptr, usage); return IRB()->CreateLoad(Ptr, Name); } - LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage) + LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(Ptr, usage); return IRB()->CreateLoad(Ty, Ptr, Name); } - LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst* + Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(Ptr, usage); return IRB()->CreateLoad(Ptr, isVolatile, Name); } - LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage) + LoadInst* Builder::LOAD(Value* basePtr, + const std::initializer_list<uint32_t>& indices, + const llvm::Twine& name, + Type* Ty, + JIT_MEM_CLIENT usage) { std::vector<Value*> valIndices; for (auto i : indices) @@ -124,7 +131,9 @@ namespace SwrJit return Builder::LOAD(GEPA(basePtr, valIndices), name); } - LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name) + LoadInst* Builder::LOADV(Value* basePtr, + const std::initializer_list<Value*>& indices, + const llvm::Twine& name) { std::vector<Value*> valIndices; for (auto i : indices) @@ -132,7 +141,8 @@ namespace SwrJit return LOAD(GEPA(basePtr, valIndices), name); } - StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices) + StoreInst* + Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices) { std::vector<Value*> valIndices; for (auto i : indices) @@ -140,7 +150,8 @@ namespace SwrJit return STORE(val, GEPA(basePtr, valIndices)); } - StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices) + StoreInst* + Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices) { std::vector<Value*> valIndices; for (auto i : indices) @@ -148,27 +159,35 @@ namespace SwrJit return STORE(val, GEPA(basePtr, valIndices)); } - Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset) + Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) { return GEP(base, offset); } - Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name) + Value* Builder::MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list<uint32_t>& indices, + const llvm::Twine& name) { - Value* i32Value = LOAD(GEP(basePtr, indices), name); + Value* i32Value = LOAD(GEP(basePtr, indices), name); Value* i32Result = ADD(i32Value, i32Incr); return STORE(i32Result, GEP(basePtr, 
indices)); } ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not + /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads /// @param vSrc - SIMD wide value that will be loaded if mask is invalid /// @param pBase - Int8* base VB address pointer value /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage) + Value* Builder::GATHERPS(Value* vSrc, + Value* pBase, + Value* vIndices, + Value* vMask, + uint8_t scale, + JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(pBase, usage); @@ -176,14 +195,19 @@ namespace SwrJit } ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not + /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads /// @param vSrc - SIMD wide value that will be loaded if mask is invalid /// @param pBase - Int8* base VB address pointer value /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage) + Value* Builder::GATHERDD(Value* vSrc, + Value* pBase, + Value* vIndices, + Value* vMask, + uint8_t scale, + JIT_MEM_CLIENT usage) { AssertMemoryUsageParams(pBase, usage); @@ -198,7 +222,8 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + Value* + Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); } @@ -213,10 +238,15 @@ namespace SwrJit return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru); } - void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage) + void Builder::Gather4(const SWR_FORMAT format, + Value* pSrcBase, + Value* byteOffsets, + Value* mask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage) { - const SWR_FORMAT_INFO &info = GetFormatInfo(format); + const SWR_FORMAT_INFO& info = GetFormatInfo(format); if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) { GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage); @@ -227,8 +257,13 @@ namespace SwrJit } } - void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage) + void Builder::GATHER4PS(const SWR_FORMAT_INFO& info, + Value* pSrcBase, + Value* byteOffsets, + Value* vMask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage) { switch (info.bpp / info.numComps) { @@ -253,10 +288,11 @@ namespace SwrJit // offset base to the next 
components(zw) in the vertex to gather pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); + vGatherResult[1] = + GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw // } else @@ -281,7 +317,8 @@ namespace SwrJit uint32_t swizzleIndex = info.swizzle[i]; // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); + vGatherComponents[swizzleIndex] = GATHERPS( + vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); // offset base to the next component to gather pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); @@ -294,18 +331,24 @@ namespace SwrJit } } - void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage) + void Builder::GATHER4DD(const SWR_FORMAT_INFO& info, + Value* pSrcBase, + Value* byteOffsets, + Value* vMask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage) { switch (info.bpp / info.numComps) { case 8: { Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); + Value* vGatherResult = + GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); } @@ -331,10 +374,11 @@ namespace SwrJit // offset base to the next components(zw) in the vertex to gather pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); + vGatherResult[1] = + GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); // e.g. 
result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw // } else @@ -344,7 +388,6 @@ namespace SwrJit // Shuffle gathered components into place, each row is a component Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } break; case 32: @@ -360,7 +403,8 @@ namespace SwrJit uint32_t swizzleIndex = info.swizzle[i]; // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); + vGatherComponents[swizzleIndex] = GATHERDD( + vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); // offset base to the next component to gather pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); @@ -373,29 +417,35 @@ namespace SwrJit } } - void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) + void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, + Value* vGatherInput[2], + Value* vGatherOutput[4], + bool bPackedOutput) { // cast types Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - // input could either be float or int vector; do shuffle work in int + // input could either be float or int vector; do shuffle work in int vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); if (bPackedOutput) { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), + mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vShufResult = + BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + Value* vi128XY = + BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy @@ -404,8 +454,10 @@ namespace SwrJit Value* vi128ZW = nullptr; if (info.numComps > 2) { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + Value* vShufResult = + BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = + BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); } for (uint32_t i = 0; i < 4; i++) @@ -425,23 +477,23 @@ namespace SwrJit // if x or y, use vi128XY permute result, 
else use vi128ZW Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - // extract packed component 128 bit lanes + // extract packed component 128 bit lanes vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); } - } else { // pshufb masks for each component Value* vConstMask[2]; // x/z shuffle mask - vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + vConstMask[0] = C<char>({ + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + }); // y/w shuffle mask - vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); - + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits // apply defaults @@ -459,32 +511,41 @@ namespace SwrJit // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 0 : 1; - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + vGatherOutput[swizzleIndex] = + BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), + vConstMask[selectedMask]), + vGatherTy); // after pshufb mask for x channel; z uses the same shuffle from the second gather // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 } } } - void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) + void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, + Value* vGatherInput, + Value* vGatherOutput[], + bool bPackedOutput) { // cast types Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits if (bPackedOutput) { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), + mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); + Value* vShufResult = + BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); + Value* vi128XY = + BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) @@ -493,10 +554,12 @@ namespace SwrJit Value* vi128ZW = 
nullptr; if (info.numComps > 2) { - vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); + vi128ZW = + BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); } - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + // sign extend all enabled components. If we have a fill vVertexElements, output to + // current simdvertex for (uint32_t i = 0; i < 4; i++) { uint32_t swizzleIndex = info.swizzle[i]; @@ -519,7 +582,8 @@ namespace SwrJit } } // else zero extend - else { + else + { // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits // apply defaults for (uint32_t i = 0; i < 4; ++i) @@ -527,7 +591,8 @@ namespace SwrJit vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); } - for (uint32_t i = 0; i < info.numComps; i++) { + for (uint32_t i = 0; i < info.numComps; i++) + { uint32_t swizzleIndex = info.swizzle[i]; // pshufb masks for each component @@ -536,45 +601,53 @@ namespace SwrJit { case 0: // x shuffle mask - vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); + vConstMask = + C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); break; case 1: // y shuffle mask - vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); + vConstMask = + C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); break; case 2: // z shuffle mask - vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); + vConstMask = + C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); break; case 3: // w shuffle mask - vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); + vConstMask = + C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); break; default: vConstMask = nullptr; break; } - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + vGatherOutput[swizzleIndex] = + BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); // after pshufb for x channel // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 + // x000 x000 x000 x000 x000 x000 x000 x000 } } } ////////////////////////////////////////////////////////////////////////// /// @brief emulates a scatter operation. 
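// A hedged scalar model of the per-channel zero-extend masks above: each
// gathered 32-bit lane carries packed 8-bit xyzw, and the pshufb mask for
// channel c moves byte c of every lane into the low byte of that lane,
// zeroing the rest (the -1 mask entries). Names here are illustrative only.
#include <cstddef>
#include <cstdint>

static void PeelChannel8bpc(const uint32_t* lanes, size_t simdWidth,
                            unsigned channel /* 0..3 = x..w */, uint32_t* out)
{
    for (size_t i = 0; i < simdWidth; ++i)
    {
        // byte 'channel' of lane i, zero-extended to 32 bits
        out[i] = (lanes[i] >> (8u * channel)) & 0xFFu;
    }
}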
- /// @param pDst - pointer to destination + /// @param pDst - pointer to destination /// @param vSrc - vector of src data to scatter /// @param vOffsets - vector of byte offsets from pDst /// @param vMask - mask of valid lanes - void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) + void Builder::SCATTERPS( + Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage) { + AssertMemoryUsageParams(pDst, usage); + /* Scatter algorithm while(Index = BitScanForward(mask)) @@ -586,25 +659,25 @@ namespace SwrJit */ BasicBlock* pCurBB = IRB()->GetInsertBlock(); - Function* pFunc = pCurBB->getParent(); - Type* pSrcTy = vSrc->getType()->getVectorElementType(); + Function* pFunc = pCurBB->getParent(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); // Store vectors on stack if (pScatterStackSrc == nullptr) { // Save off stack allocations and reuse per scatter. Significantly reduces stack // requirements for shaders with a lot of scatters. - pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); + pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); } - Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); + Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); Value* pOffsetsArrayPtr = pScatterStackOffsets; STORE(vSrc, pSrcArrayPtr); STORE(vOffsets, pOffsetsArrayPtr); // Cast to pointers for random access - pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); + pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); Value* pMask = VMOVMSK(vMask); @@ -643,18 +716,18 @@ namespace SwrJit // Add loop basic block contents IRB()->SetInsertPoint(pLoop); PHINode* pIndexPhi = PHI(mInt32Ty, 2); - PHINode* pMaskPhi = PHI(mInt32Ty, 2); + PHINode* pMaskPhi = PHI(mInt32Ty, 2); pIndexPhi->addIncoming(pIndex, pCurBB); pMaskPhi->addIncoming(pMask, pCurBB); // Extract elements for this index - Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi }); - Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi }); + Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi}); + Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi}); // GEP to this offset in dst Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy); - pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); + pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); STORE(pSrcElem, pCurDst); // Update the mask @@ -673,4 +746,4 @@ namespace SwrJit // Move builder to beginning of post loop IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); } -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h index 3823a136bb8..15def96cb76 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h @@ -1,36 +1,35 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
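// The "Scatter algorithm" pseudocode above, written out as a scalar sketch
// (hypothetical helper; the JIT version emits this loop as IR basic blocks
// with PHI nodes for the index and mask, and __builtin_ctz stands in for
// BitScanForward on GCC/Clang):
#include <cstdint>
#include <cstring>

static void ScatterPS(uint8_t* pDst, const float* vSrc,
                      const uint32_t* vOffsets, uint32_t mask)
{
    while (mask != 0)
    {
        unsigned index = static_cast<unsigned>(__builtin_ctz(mask)); // BitScanForward
        std::memcpy(pDst + vOffsets[index], &vSrc[index], sizeof(float));
        mask &= ~(1u << index); // retire this lane and loop
    }
}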
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_misc.h -* -* @brief miscellaneous builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
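// A hedged scalar model of the 16bpc Gather4 scheme in the GATHER4PS /
// GATHER4DD hunks above: one gather fetches the packed xy pairs, a second
// gather from base+4 fetches zw, and the shuffle stage splits each 32-bit
// lane into two 16-bit components. Function and parameter names are
// illustrative only.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void Gather4_16bpc(const uint8_t* pSrcBase, const uint32_t* byteOffsets,
                          size_t simdWidth,
                          uint16_t* x, uint16_t* y, uint16_t* z, uint16_t* w)
{
    for (size_t i = 0; i < simdWidth; ++i)
    {
        const uint8_t* p = pSrcBase + byteOffsets[i];
        uint32_t xy, zw;
        std::memcpy(&xy, p, sizeof(xy));     // first gather: xy pairs
        std::memcpy(&zw, p + 4, sizeof(zw)); // second gather: OFFSET_TO_NEXT_COMPONENT(+4)
        x[i] = static_cast<uint16_t>(xy);
        y[i] = static_cast<uint16_t>(xy >> 16);
        z[i] = static_cast<uint16_t>(zw);
        w[i] = static_cast<uint16_t>(zw >> 16);
    }
}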
+ * + * @file builder_misc.h + * + * @brief miscellaneous builder functions + * + * Notes: + * + ******************************************************************************/ #pragma once public: - typedef enum _JIT_MEM_CLIENT { MEM_CLIENT_INTERNAL, @@ -40,62 +39,119 @@ typedef enum _JIT_MEM_CLIENT } JIT_MEM_CLIENT; protected: - -virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset); -void AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage); +virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); +void AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage); public: - -virtual Value *GEP(Value *Ptr, Value *Idx, Type *Ty = nullptr, const Twine &Name = ""); -virtual Value *GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = ""); -virtual Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty = nullptr); -virtual Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty = nullptr); - -Value *GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = ""); -Value *GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = ""); - -Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList); -Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); - -virtual LoadInst* LOAD(Value *Ptr, const char *Name, Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value *Ptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Type *Ty, Value *Ptr, const Twine &Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value *Ptr, bool isVolatile, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -virtual CallInst* MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = nullptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL) +virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, const Twine& Name = ""); +virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = ""); +virtual Value* GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr); +virtual Value* +GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr); + +Value* GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = ""); +Value* GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = ""); + +Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList); +Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList); + +virtual LoadInst* + LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); +virtual LoadInst* LOAD(Value* Ptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); +virtual LoadInst* + LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); +virtual LoadInst* LOAD(Value* Ptr, + bool isVolatile, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); +virtual LoadInst* LOAD(Value* BasePtr, + const std::initializer_list<uint32_t>& offset, + const llvm::Twine& Name = "", + 
Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +virtual CallInst* MASKED_LOAD(Value* Ptr, + unsigned Align, + Value* Mask, + Value* PassThru = nullptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL) { return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); } -LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = ""); -StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset); -StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset); - -Value* MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name = ""); - -void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - -Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); - -Value *GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru); - -void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); - -void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); -void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); +LoadInst* + LOADV(Value* BasePtr, const std::initializer_list<Value*>& offset, const llvm::Twine& name = ""); +StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset); +StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list<Value*>& offset); + +Value* MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list<uint32_t>& indices, + const llvm::Twine& name = ""); + +void Gather4(const SWR_FORMAT format, + Value* pSrcBase, + Value* byteOffsets, + Value* mask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +virtual Value* GATHERPS(Value* src, + Value* pBase, + Value* indices, + Value* mask, + uint8_t scale = 1, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +void GATHER4PS(const SWR_FORMAT_INFO& info, + Value* pSrcBase, + Value* byteOffsets, + Value* mask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +virtual Value* GATHERDD(Value* src, + Value* pBase, + Value* indices, + Value* mask, + uint8_t scale = 1, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +void GATHER4DD(const SWR_FORMAT_INFO& info, + Value* pSrcBase, + Value* byteOffsets, + Value* mask, + Value* vGatherComponents[], + bool bPackedOutput, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 
1); + +Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru); + +virtual void SCATTERPS(Value* pDst, + Value* vSrc, + Value* vOffsets, + Value* vMask, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + +void Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, + Value* vGatherInput, + Value* vGatherOutput[], + bool bPackedOutput); +void Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, + Value* vGatherInput[], + Value* vGatherOutput[], + bool bPackedOutput); // Static stack allocations for scatter operations -Value* pScatterStackSrc{ nullptr }; -Value* pScatterStackOffsets{ nullptr }; +Value* pScatterStackSrc{nullptr}; +Value* pScatterStackOffsets{nullptr}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 231fa94d00c..4116dad4430 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_misc.cpp -* -* @brief Implementation for miscellaneous builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
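// A hedged reference model of the float32 -> float16 packing in the
// builder_misc.cpp hunk below, reconstructed from the bit manipulation shown
// there. The final rounding tweak ("essentially RTZ, but round up if off by
// only 1 lsb") and the packing line are elided by the diff, so plain
// truncation and an assumed sign/exp/mant pack are used here.
#include <cmath>
#include <cstdint>
#include <cstring>

static uint16_t Float32ToFloat16(float val)
{
    uint32_t uf;
    std::memcpy(&uf, &val, sizeof(uf));
    uint32_t sign = (uf & 0x80000000u) >> 31;
    uint32_t exp  = (uf & 0x7F800000u) >> 23;
    uint32_t mant =  uf & 0x007FFFFFu;

    if (std::isnan(val))        { exp = 0x1F; mant = 0x200; sign = 1; }
    else if (std::isinf(val))   { exp = 0x1F; mant = 0; }
    else if (exp > 0x70 + 0x1E) { exp = 0x1E; mant = 0x3FF; } // clamp to max representable
    else if (exp <= 0x70 && exp >= 0x66)                      // denormal range
    {
        mant |= 0x00800000u;
        for (; exp <= 0x70; mant >>= 1, exp++)
            ;
        exp = 0;
        mant >>= 13;
    }
    else if (exp < 0x66)        { exp = 0; mant = 0; }        // too small -> zero
    else
    {
        exp -= 0x70;  // rebias 8-bit exponent to 5 bits
        mant >>= 13;  // truncate 23-bit mantissa to 10 (RTZ; see note above)
    }
    return static_cast<uint16_t>((sign << 15) | (exp << 10) | mant);
}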
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file builder_misc.cpp + * + * @brief Implementation for miscellaneous builder functions + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder.h" #include "common/rdtsc_buckets.h" @@ -50,25 +50,25 @@ namespace SwrJit // Extract the sign, exponent, and mantissa uint32_t uf = *(uint32_t*)&val; - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; // Check for out of range if (std::isnan(val)) { - exp = 0x1F; + exp = 0x1F; mant = 0x200; - sign = 1; // set the sign bit for NANs + sign = 1; // set the sign bit for NANs } else if (std::isinf(val)) { - exp = 0x1f; + exp = 0x1f; mant = 0x0; } else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value { - exp = 0x1E; + exp = 0x1E; mant = 0x3FF; } else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm @@ -76,12 +76,12 @@ namespace SwrJit mant |= 0x00800000; for (; exp <= 0x70; mant >>= 1, exp++) ; - exp = 0; + exp = 0; mant = mant >> 13; } else if (exp < 0x66) // Too small to represent -> Zero { - exp = 0; + exp = 0; mant = 0; } else @@ -89,7 +89,7 @@ namespace SwrJit // Saves bits that will be shifted off for rounding roundBits = mant & 0x1FFFu; // convert exponent and mantissa to 16 bit format - exp = exp - 0x70; + exp = exp - 0x70; mant = mant >> 13; // Essentially RTZ, but round up if off by only 1 lsb @@ -129,7 +129,7 @@ namespace SwrJit { uint32_t sign = (val & 0x8000) << 16; uint32_t mant = (val & 0x3ff) << 13; - uint32_t exp = (val >> 10) & 0x1f; + uint32_t exp = (val >> 10) & 0x1f; if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals { mant <<= 1; @@ -140,139 +140,94 @@ namespace SwrJit } mant &= (0x3ff << 13); } - exp = ((exp - 15 + 127) & 0xff) << 23; + exp = ((exp - 15 + 127) & 0xff) << 23; result = sign | exp | mant; } return *(float*)&result; } - Constant *Builder::C(bool i) - { - return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); - } + Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 
1 : 0)); } - Constant *Builder::C(char i) - { - return ConstantInt::get(IRB()->getInt8Ty(), i); - } + Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } - Constant *Builder::C(uint8_t i) - { - return ConstantInt::get(IRB()->getInt8Ty(), i); - } + Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } - Constant *Builder::C(int i) - { - return ConstantInt::get(IRB()->getInt32Ty(), i); - } + Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } - Constant *Builder::C(int64_t i) - { - return ConstantInt::get(IRB()->getInt64Ty(), i); - } + Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } - Constant *Builder::C(uint16_t i) - { - return ConstantInt::get(mInt16Ty,i); - } + Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); } - Constant *Builder::C(uint32_t i) - { - return ConstantInt::get(IRB()->getInt32Ty(), i); - } + Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } - Constant *Builder::C(uint64_t i) - { - return ConstantInt::get(IRB()->getInt64Ty(), i); - } + Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } - Constant *Builder::C(float i) - { - return ConstantFP::get(IRB()->getFloatTy(), i); - } + Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); } - Constant *Builder::PRED(bool pred) + Constant* Builder::PRED(bool pred) { return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); } - Value *Builder::VIMMED1(int i) + Value* Builder::VIMMED1(int i) { return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1_16(int i) + Value* Builder::VIMMED1_16(int i) { return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1(uint32_t i) + Value* Builder::VIMMED1(uint32_t i) { return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1_16(uint32_t i) + Value* Builder::VIMMED1_16(uint32_t i) { return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1(float i) + Value* Builder::VIMMED1(float i) { return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } - Value *Builder::VIMMED1_16(float i) + Value* Builder::VIMMED1_16(float i) { return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i))); } - Value *Builder::VIMMED1(bool i) + Value* Builder::VIMMED1(bool i) { return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1_16(bool i) + Value* Builder::VIMMED1_16(bool i) { return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } - Value *Builder::VUNDEF_IPTR() - { - return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); - } + Value* Builder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth)); } - Value *Builder::VUNDEF(Type* t) - { - return UndefValue::get(VectorType::get(t, mVWidth)); - } + Value* Builder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); } - Value *Builder::VUNDEF_I() - { - return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); - } + Value* Builder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } - Value *Builder::VUNDEF_I_16() - { - return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); - } + Value* Builder::VUNDEF_I_16() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); } - Value *Builder::VUNDEF_F() - { - return 
UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); - } + Value* Builder::VUNDEF_F() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } - Value *Builder::VUNDEF_F_16() - { - return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); - } + Value* Builder::VUNDEF_F_16() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); } - Value *Builder::VUNDEF(Type *ty, uint32_t size) + Value* Builder::VUNDEF(Type* ty, uint32_t size) { return UndefValue::get(VectorType::get(ty, size)); } - Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name) + Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name) { // check if src is already a vector if (src->getType()->isVectorTy()) @@ -283,7 +238,7 @@ namespace SwrJit return VECTOR_SPLAT(mVWidth, src, name); } - Value *Builder::VBROADCAST_16(Value *src) + Value* Builder::VBROADCAST_16(Value* src) { // check if src is already a vector if (src->getType()->isVectorTy()) @@ -297,18 +252,20 @@ namespace SwrJit uint32_t Builder::IMMED(Value* v) { SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt *pValConst = cast<ConstantInt>(v); + ConstantInt* pValConst = cast<ConstantInt>(v); return pValConst->getZExtValue(); } int32_t Builder::S_IMMED(Value* v) { SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt *pValConst = cast<ConstantInt>(v); + ConstantInt* pValConst = cast<ConstantInt>(v); return pValConst->getSExtValue(); } - CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name) + CallInst* Builder::CALL(Value* Callee, + const std::initializer_list<Value*>& argsList, + const llvm::Twine& name) { std::vector<Value*> args; for (auto arg : argsList) @@ -316,14 +273,14 @@ namespace SwrJit return CALLA(Callee, args, name); } - CallInst *Builder::CALL(Value *Callee, Value* arg) + CallInst* Builder::CALL(Value* Callee, Value* arg) { std::vector<Value*> args; args.push_back(arg); return CALLA(Callee, args); } - CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2) + CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2) { std::vector<Value*> args; args.push_back(arg1); @@ -331,7 +288,7 @@ namespace SwrJit return CALLA(Callee, args); } - CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3) + CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3) { std::vector<Value*> args; args.push_back(arg1); @@ -340,15 +297,15 @@ namespace SwrJit return CALLA(Callee, args); } - Value *Builder::VRCP(Value *va, const llvm::Twine& name) + Value* Builder::VRCP(Value* va, const llvm::Twine& name) { - return FDIV(VIMMED1(1.0f), va, name); // 1 / a + return FDIV(VIMMED1(1.0f), va, name); // 1 / a } - Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) + Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY) { Value* vOut = FMADDPS(vA, vX, vC); - vOut = FMADDPS(vB, vY, vOut); + vOut = FMADDPS(vB, vY, vOut); return vOut; } @@ -362,7 +319,8 @@ namespace SwrJit /// result from a GEP, printing out the pointer to memory /// @param printStr - constant string to print, which includes format specifiers /// @param printArgs - initializer list of Value*'s to print to std out - CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs) + CallInst* Builder::PRINT(const std::string& printStr, + const std::initializer_list<Value*>& printArgs) { // push the arguments to CallPrint into a vector std::vector<Value*> printCallArgs; @@ -370,15 +328,15 @@ namespace 
SwrJit printCallArgs.resize(1); // search through the format string for special processing - size_t pos = 0; + size_t pos = 0; std::string tempStr(printStr); - pos = tempStr.find('%', pos); + pos = tempStr.find('%', pos); auto v = printArgs.begin(); while ((pos != std::string::npos) && (v != printArgs.end())) { - Value* pArg = *v; - Type* pType = pArg->getType(); + Value* pArg = *v; + Type* pType = pArg->getType(); if (pType->isVectorTy()) { @@ -386,7 +344,7 @@ namespace SwrJit if (toupper(tempStr[pos + 1]) == 'X') { - tempStr[pos] = '0'; + tempStr[pos] = '0'; tempStr[pos + 1] = 'x'; tempStr.insert(pos + 2, "%08X "); pos += 7; @@ -410,9 +368,11 @@ namespace SwrJit { tempStr.insert(pos, std::string("%f ")); pos += 3; - printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); + printCallArgs.push_back( + FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); } - printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); + printCallArgs.push_back( + FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); } else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) { @@ -421,9 +381,11 @@ namespace SwrJit { tempStr.insert(pos, std::string("%d ")); pos += 3; - printCallArgs.push_back(S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + printCallArgs.push_back( + S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } - printCallArgs.push_back(S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + printCallArgs.push_back( + S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy())) { @@ -432,9 +394,11 @@ namespace SwrJit { tempStr.insert(pos, std::string("%d ")); pos += 3; - printCallArgs.push_back(Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + printCallArgs.push_back( + Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } - printCallArgs.push_back(Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + printCallArgs.push_back( + Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } } else @@ -464,89 +428,82 @@ namespace SwrJit } // create global variable constant string - Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); - GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); + Constant* constString = ConstantDataArray::getString(JM()->mContext, tempStr, true); + GlobalVariable* gvPtr = new GlobalVariable( + constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr"); JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); // get a pointer to the first character in the constant string array - std::vector<Constant*> geplist{C(0),C(0)}; - Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); + std::vector<Constant*> geplist{C(0), C(0)}; + Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false); // insert the pointer to the format string in the argument vector printCallArgs[0] = strGEP; // get pointer to CallPrint function and insert decl into the module if needed std::vector<Type*> args; - args.push_back(PointerType::get(mInt8Ty,0)); - FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); - Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); + 
args.push_back(PointerType::get(mInt8Ty, 0)); + FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true); + Function* callPrintFn = + cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); // if we haven't yet added the symbol to the symbol table - if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) + if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) { - sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); + sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint); } // insert a call to CallPrint - return CALLA(callPrintFn,printCallArgs); + return CALLA(callPrintFn, printCallArgs); } ////////////////////////////////////////////////////////////////////////// /// @brief Wrapper around PRINT with initializer list. - CallInst* Builder::PRINT(const std::string &printStr) - { - return PRINT(printStr, {}); - } + CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); } - Value *Builder::EXTRACT_16(Value *x, uint32_t imm) + Value* Builder::EXTRACT_16(Value* x, uint32_t imm) { if (imm == 0) { - return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 }); + return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); } else { - return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 }); + return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); } } - Value *Builder::JOIN_16(Value *a, Value *b) + Value* Builder::JOIN_16(Value* a, Value* b) { - return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); + return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); } ////////////////////////////////////////////////////////////////////////// /// @brief convert x86 <N x float> mask to llvm <N x i1> mask - Value *Builder::MASK(Value *vmask) + Value* Builder::MASK(Value* vmask) { - Value *src = BITCAST(vmask, mSimdInt32Ty); + Value* src = BITCAST(vmask, mSimdInt32Ty); return ICMP_SLT(src, VIMMED1(0)); } - Value *Builder::MASK_16(Value *vmask) + Value* Builder::MASK_16(Value* vmask) { - Value *src = BITCAST(vmask, mSimd16Int32Ty); + Value* src = BITCAST(vmask, mSimd16Int32Ty); return ICMP_SLT(src, VIMMED1_16(0)); } ////////////////////////////////////////////////////////////////////////// /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask - Value *Builder::VMASK(Value *mask) - { - return S_EXT(mask, mSimdInt32Ty); - } + Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); } - Value *Builder::VMASK_16(Value *mask) - { - return S_EXT(mask, mSimd16Int32Ty); - } + Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); } /// @brief Convert <Nxi1> llvm mask to integer - Value *Builder::VMOVMSK(Value* mask) + Value* Builder::VMOVMSK(Value* mask) { SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty); uint32_t numLanes = mask->getType()->getVectorNumElements(); - Value* i32Result; + Value* i32Result; if (numLanes == 8) { i32Result = BITCAST(mask, mInt8Ty); @@ -564,18 +521,18 @@ namespace SwrJit } ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation in LLVM IR. If not + /// @brief Generate a VPSHUFB operation in LLVM IR. 
If not /// supported on the underlying platform, emulate it /// @param a - 256bit SIMD(32x8bit) of 8bit integer values /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values - /// Byte masks in lower 128 lane of b selects 8 bit values from lower - /// 128bits of a, and vice versa for the upper lanes. If the mask + /// Byte masks in lower 128 lane of b selects 8 bit values from lower + /// 128bits of a, and vice versa for the upper lanes. If the mask /// value is negative, '0' is inserted. - Value *Builder::PSHUFB(Value* a, Value* b) + Value* Builder::PSHUFB(Value* a, Value* b) { Value* res; // use avx2 pshufb instruction if available - if(JM()->mArch.AVX2()) + if (JM()->mArch.AVX2()) { res = VPSHUFB(a, b); } @@ -589,22 +546,26 @@ namespace SwrJit // insert an 8 bit value from the high and low lanes of a per loop iteration numElms /= 2; - for(uint32_t i = 0; i < numElms; i++) + for (uint32_t i = 0; i < numElms; i++) { - ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); + ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); // extract values from constant mask - char valLow128bLane = (char)(cLow128b->getSExtValue()); + char valLow128bLane = (char)(cLow128b->getSExtValue()); char valHigh128bLane = (char)(cHigh128b->getSExtValue()); Value* insertValLow128b; Value* insertValHigh128b; // if the mask value is negative, insert a '0' in the respective output position - // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector - insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); - insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); + // otherwise, lookup the value at mask position (bits 3..0 of the respective mask + // byte) in a and insert in output vector + insertValLow128b = + (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); + insertValHigh128b = (valHigh128bLane < 0) + ? C((char)0) + : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); vShuf = VINSERT(vShuf, insertValLow128b, i); vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); @@ -615,11 +576,11 @@ namespace SwrJit } ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 + /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it - /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only + /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only /// lower 8 values are used. - Value *Builder::PMOVSXBD(Value* a) + Value* Builder::PMOVSXBD(Value* a) { // VPMOVSXBD output type Type* v8x32Ty = VectorType::get(mInt32Ty, 8); @@ -628,10 +589,10 @@ namespace SwrJit } ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 + /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. 
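// Illustrative sketch (not part of this change, hypothetical helper name):
// a scalar reference for the 256-bit VPSHUFB semantics that the emulation
// path above reproduces lane by lane. Each 128-bit half of the mask selects
// bytes only from the matching 128-bit half of 'a', and a mask byte with
// bit 7 set inserts '0'; bits 3..0 of the mask byte index within the lane.
#include <array>
#include <cstdint>

std::array<uint8_t, 32> pshufb_ref(const std::array<uint8_t, 32>& a,
                                   const std::array<uint8_t, 32>& mask)
{
    std::array<uint8_t, 32> out{};
    for (int half = 0; half < 2; ++half) // two independent 128-bit lanes
    {
        const int base = half * 16;
        for (int i = 0; i < 16; ++i)
        {
            const uint8_t m = mask[base + i];
            // negative mask byte -> 0, otherwise select within the same lane
            out[base + i] = (m & 0x80) ? 0 : a[base + (m & 0x0F)];
        }
    }
    return out;
}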
- Value *Builder::PMOVSXWD(Value* a) + Value* Builder::PMOVSXWD(Value* a) { // VPMOVSXWD output type Type* v8x32Ty = VectorType::get(mInt32Ty, 8); @@ -643,7 +604,7 @@ namespace SwrJit /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. - Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name) + Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name) { if (JM()->mArch.F16C()) { @@ -651,20 +612,22 @@ namespace SwrJit } else { - FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); - Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy)); + FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); + Function* pCvtPh2Ps = cast<Function>( + JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy)); if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr) { - sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32); + sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", + (void*)&ConvertFloat16ToFloat32); } Value* pResult = UndefValue::get(mSimdFP32Ty); for (uint32_t i = 0; i < mVWidth; ++i) { - Value* pSrc = VEXTRACT(a, C(i)); + Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); + pResult = VINSERT(pResult, pConv, C(i)); } pResult->setName(name); @@ -676,7 +639,7 @@ namespace SwrJit /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
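// Illustrative sketch (not part of this change): a standalone scalar
// equivalent of the float16 -> float32 expansion that the non-F16C
// CVTPH2PS path above delegates to via the "ConvertFloat16ToFloat32"
// callback. half_to_float_ref is a hypothetical name; the bit handling
// follows IEEE 754 half precision, including denormals and inf/NaN.
#include <cstdint>
#include <cstring>

float half_to_float_ref(uint16_t h)
{
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    int32_t  exp  = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FF;
    uint32_t bits;

    if (exp == 0x1F) // inf / NaN: force the all-ones single-precision exponent
    {
        bits = sign | 0x7F800000u | (mant << 13);
    }
    else if (exp != 0) // normal: rebias exponent from 15 to 127
    {
        bits = sign | ((uint32_t)(exp - 15 + 127) << 23) | (mant << 13);
    }
    else if (mant != 0) // denormal: renormalize, adjusting the exponent
    {
        exp = -15 + 127 + 1;
        while ((mant & 0x400) == 0)
        {
            mant <<= 1;
            --exp;
        }
        mant &= 0x3FF; // drop the implicit leading bit
        bits = sign | ((uint32_t)exp << 23) | (mant << 13);
    }
    else // +/- zero
    {
        bits = sign;
    }

    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}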
- Value *Builder::CVTPS2PH(Value* a, Value* rounding) + Value* Builder::CVTPS2PH(Value* a, Value* rounding) { if (JM()->mArch.F16C()) { @@ -685,45 +648,47 @@ namespace SwrJit else { // call scalar C function for now - FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); - Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); + FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); + Function* pCvtPs2Ph = cast<Function>( + JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr) { - sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16); + sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", + (void*)&ConvertFloat32ToFloat16); } Value* pResult = UndefValue::get(mSimdInt16Ty); for (uint32_t i = 0; i < mVWidth; ++i) { - Value* pSrc = VEXTRACT(a, C(i)); + Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); + pResult = VINSERT(pResult, pConv, C(i)); } return pResult; } } - Value *Builder::PMAXSD(Value* a, Value* b) + Value* Builder::PMAXSD(Value* a, Value* b) { Value* cmp = ICMP_SGT(a, b); return SELECT(cmp, a, b); } - Value *Builder::PMINSD(Value* a, Value* b) + Value* Builder::PMINSD(Value* a, Value* b) { Value* cmp = ICMP_SLT(a, b); return SELECT(cmp, a, b); } - Value *Builder::PMAXUD(Value* a, Value* b) + Value* Builder::PMAXUD(Value* a, Value* b) { Value* cmp = ICMP_UGT(a, b); return SELECT(cmp, a, b); } - Value *Builder::PMINUD(Value* a, Value* b) + Value* Builder::PMINUD(Value* a, Value* b) { Value* cmp = ICMP_ULT(a, b); return SELECT(cmp, a, b); @@ -733,65 +698,65 @@ namespace SwrJit Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) { auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), - pFunc->getEntryBlock().begin()); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); Value* pAlloca = ALLOCA(pType); - if (saveIP.isSet()) IRB()->restoreIP(saveIP); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); return pAlloca; } Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) { auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), - pFunc->getEntryBlock().begin()); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); Value* pAlloca = ALLOCA(pType, pArraySize); - if (saveIP.isSet()) IRB()->restoreIP(saveIP); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); return pAlloca; } Value* Builder::VABSPS(Value* a) { - Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* asInt = BITCAST(a, mSimdInt32Ty); Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); return result; } - Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) + Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) { - Value *lowCmp = ICMP_SLT(src, low); - Value *ret = SELECT(lowCmp, low, src); + Value* lowCmp = ICMP_SLT(src, low); + Value* ret = SELECT(lowCmp, low, src); - Value *highCmp = ICMP_SGT(ret, high); - ret = SELECT(highCmp, high, ret, name); + Value* highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret, name); return ret; } - Value *Builder::FCLAMP(Value* src, Value* low, Value* high) + Value* Builder::FCLAMP(Value* src, Value* low, Value* high) { - Value 
*lowCmp = FCMP_OLT(src, low); - Value *ret = SELECT(lowCmp, low, src); + Value* lowCmp = FCMP_OLT(src, low); + Value* ret = SELECT(lowCmp, low, src); - Value *highCmp = FCMP_OGT(ret, high); - ret = SELECT(highCmp, high, ret); + Value* highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); return ret; } - Value *Builder::FCLAMP(Value* src, float low, float high) + Value* Builder::FCLAMP(Value* src, float low, float high) { Value* result = VMAXPS(src, VIMMED1(low)); - result = VMINPS(result, VIMMED1(high)); + result = VMINPS(result, VIMMED1(high)); return result; } - Value *Builder::FMADDPS(Value* a, Value* b, Value* c) + Value* Builder::FMADDPS(Value* a, Value* b, Value* c) { Value* vOut; // use FMADs if available - if(JM()->mArch.AVX2()) + if (JM()->mArch.AVX2()) { vOut = VFMADDPS(a, b, c); } @@ -804,39 +769,40 @@ namespace SwrJit ////////////////////////////////////////////////////////////////////////// /// @brief pop count on vector mask (e.g. <8 x i1>) - Value* Builder::VPOPCNT(Value* a) - { - return POPCNT(VMOVMSK(a)); - } + Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); } ////////////////////////////////////////////////////////////////////////// /// @brief C functions called by LLVM IR ////////////////////////////////////////////////////////////////////////// - Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) + Value* Builder::VEXTRACTI128(Value* a, Constant* imm8) { - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < mVWidth / 2; i++) { + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*, 8> idx; + for (unsigned i = 0; i < mVWidth / 2; i++) + { idx.push_back(C(flag ? i + mVWidth / 2 : i)); } return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); } - Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) + Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) { - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < mVWidth; i++) { + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*, 8> idx; + for (unsigned i = 0; i < mVWidth; i++) + { idx.push_back(C(i)); } - Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); - SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < mVWidth / 2; i++) { + SmallVector<Constant*, 8> idx2; + for (unsigned i = 0; i < mVWidth / 2; i++) + { idx2.push_back(C(flag ? i : i + mVWidth)); } - for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + for (unsigned i = mVWidth / 2; i < mVWidth; i++) + { idx2.push_back(C(flag ? 
i + mVWidth / 2 : i)); } return VSHUFFLE(a, inter, ConstantVector::get(idx2)); @@ -845,45 +811,51 @@ namespace SwrJit // rdtsc buckets macros void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) { - // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into - // buckets framework when single threaded + // @todo due to an issue with thread local storage propagation in llvm, we can only safely + // call into buckets framework when single threaded if (KNOB_SINGLE_THREADED) { std::vector<Type*> args{ - PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id }; FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + Function* pFunc = cast<Function>( + JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == + nullptr) { - sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", + (void*)&BucketManager_StartBucket); } - CALL(pFunc, { pBucketMgr, pId }); + CALL(pFunc, {pBucketMgr, pId}); } } void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) { - // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into - // buckets framework when single threaded + // @todo due to an issue with thread local storage propagation in llvm, we can only safely + // call into buckets framework when single threaded if (KNOB_SINGLE_THREADED) { std::vector<Type*> args{ - PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id }; FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + Function* pFunc = cast<Function>( + JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == + nullptr) { - sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", + (void*)&BucketManager_StopBucket); } - CALL(pFunc, { pBucketMgr, pId }); + CALL(pFunc, {pBucketMgr, pId}); } } @@ -892,14 +864,14 @@ namespace SwrJit if (pType->isStructTy()) { uint32_t numElems = pType->getStructNumElements(); - Type* pElemTy = pType->getStructElementType(0); + Type* pElemTy = pType->getStructElementType(0); return numElems * GetTypeSize(pElemTy); } if (pType->isArrayTy()) { uint32_t numElems = pType->getArrayNumElements(); - Type* pElemTy = pType->getArrayElementType(); + Type* pElemTy = pType->getArrayElementType(); return numElems * GetTypeSize(pElemTy); } @@ -927,4 +899,4 @@ namespace SwrJit SWR_ASSERT(false, "Unimplemented type."); return 0; } -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index d7732ef8c2a..f8701f9ba84 100644 --- 
a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -1,156 +1,164 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_misc.h -* -* @brief miscellaneous builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file builder_misc.h + * + * @brief miscellaneous builder functions + * + * Notes: + * + ******************************************************************************/ #pragma once -Constant *C(bool i); -Constant *C(char i); -Constant *C(uint8_t i); -Constant *C(int i); -Constant *C(int64_t i); -Constant *C(uint64_t i); -Constant *C(uint16_t i); -Constant *C(uint32_t i); -Constant *C(float i); - -template<typename Ty> -Constant *C(const std::initializer_list<Ty> &constList) +Constant* C(bool i); +Constant* C(char i); +Constant* C(uint8_t i); +Constant* C(int i); +Constant* C(int64_t i); +Constant* C(uint64_t i); +Constant* C(uint16_t i); +Constant* C(uint32_t i); +Constant* C(float i); + +template <typename Ty> +Constant* C(const std::initializer_list<Ty>& constList) { std::vector<Constant*> vConsts; - for(auto i : constList) { - + for (auto i : constList) + { vConsts.push_back(C((Ty)i)); } return ConstantVector::get(vConsts); } -template<typename Ty> -Constant *CA(LLVMContext& ctx, ArrayRef<Ty> constList) +template <typename Ty> +Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList) { return ConstantDataArray::get(ctx, constList); } -template<typename Ty> -Constant *CInc(uint32_t base, uint32_t count) +template <typename Ty> +Constant* CInc(uint32_t base, uint32_t count) { std::vector<Constant*> vConsts; - for(uint32_t i = 0; i < count; i++) { + for (uint32_t i = 0; i < count; i++) + { vConsts.push_back(C((Ty)base)); base++; } return ConstantVector::get(vConsts); } -Constant *PRED(bool pred); +Constant* PRED(bool pred); -Value *VIMMED1(int i); -Value *VIMMED1_16(int i); +Value* VIMMED1(int i); +Value* VIMMED1_16(int i); -Value *VIMMED1(uint32_t i); -Value *VIMMED1_16(uint32_t i); +Value* VIMMED1(uint32_t i); +Value* VIMMED1_16(uint32_t i); -Value *VIMMED1(float i); -Value *VIMMED1_16(float i); +Value* VIMMED1(float i); +Value* VIMMED1_16(float i); -Value *VIMMED1(bool i); -Value *VIMMED1_16(bool i); +Value* VIMMED1(bool i); +Value* VIMMED1_16(bool i); -Value *VUNDEF(Type* t); +Value* VUNDEF(Type* t); -Value *VUNDEF_F(); -Value *VUNDEF_F_16(); +Value* VUNDEF_F(); +Value* VUNDEF_F_16(); -Value *VUNDEF_I(); -Value *VUNDEF_I_16(); +Value* VUNDEF_I(); +Value* VUNDEF_I_16(); -Value *VUNDEF(Type* ty, uint32_t size); +Value* VUNDEF(Type* ty, uint32_t size); -Value *VUNDEF_IPTR(); +Value* VUNDEF_IPTR(); -Value *VBROADCAST(Value *src, const llvm::Twine& name = ""); -Value *VBROADCAST_16(Value *src); +Value* VBROADCAST(Value* src, const llvm::Twine& name = ""); +Value* VBROADCAST_16(Value* src); -Value *VRCP(Value *va, const llvm::Twine& name = ""); -Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); +Value* VRCP(Value* va, const llvm::Twine& name = ""); +Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY); uint32_t IMMED(Value* i); -int32_t S_IMMED(Value* i); +int32_t S_IMMED(Value* i); -CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args, const llvm::Twine& name = ""); -CallInst *CALL(Value *Callee) { return CALLA(Callee); } -CallInst *CALL(Value *Callee, Value* arg); -CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2); -CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3); +CallInst* + CALL(Value* Callee, const std::initializer_list<Value*>& args, const llvm::Twine& name = ""); +CallInst* CALL(Value* Callee) +{ + return CALLA(Callee); +} +CallInst* CALL(Value* Callee, Value* arg); +CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2); +CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, 
Value* arg3); -Value *MASK(Value *vmask); -Value *MASK_16(Value *vmask); +Value* MASK(Value* vmask); +Value* MASK_16(Value* vmask); -Value *VMASK(Value *mask); -Value *VMASK_16(Value *mask); +Value* VMASK(Value* mask); +Value* VMASK_16(Value* mask); -Value *VMOVMSK(Value *mask); +Value* VMOVMSK(Value* mask); ////////////////////////////////////////////////////////////////////////// /// @brief functions that build IR to call x86 intrinsics directly, or /// emulate them with other instructions if not available on the host ////////////////////////////////////////////////////////////////////////// -Value *EXTRACT_16(Value *x, uint32_t imm); -Value *JOIN_16(Value *a, Value *b); +Value* EXTRACT_16(Value* x, uint32_t imm); +Value* JOIN_16(Value* a, Value* b); -Value *PSHUFB(Value* a, Value* b); -Value *PMOVSXBD(Value* a); -Value *PMOVSXWD(Value* a); -Value *CVTPH2PS(Value* a, const llvm::Twine& name = ""); -Value *CVTPS2PH(Value* a, Value* rounding); -Value *PMAXSD(Value* a, Value* b); -Value *PMINSD(Value* a, Value* b); -Value *PMAXUD(Value* a, Value* b); -Value *PMINUD(Value* a, Value* b); -Value *VABSPS(Value* a); -Value *FMADDPS(Value* a, Value* b, Value* c); +Value* PSHUFB(Value* a, Value* b); +Value* PMOVSXBD(Value* a); +Value* PMOVSXWD(Value* a); +Value* CVTPH2PS(Value* a, const llvm::Twine& name = ""); +Value* CVTPS2PH(Value* a, Value* rounding); +Value* PMAXSD(Value* a, Value* b); +Value* PMINSD(Value* a, Value* b); +Value* PMAXUD(Value* a, Value* b); +Value* PMINUD(Value* a, Value* b); +Value* VABSPS(Value* a); +Value* FMADDPS(Value* a, Value* b, Value* c); -Value *ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); -Value *FCLAMP(Value* src, Value* low, Value* high); -Value *FCLAMP(Value* src, float low, float high); +Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); +Value* FCLAMP(Value* src, Value* low, Value* high); +Value* FCLAMP(Value* src, float low, float high); -CallInst *PRINT(const std::string &printStr); -CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs); +CallInst* PRINT(const std::string& printStr); +CallInst* PRINT(const std::string& printStr, const std::initializer_list<Value*>& printArgs); Value* VPOPCNT(Value* a); -Value* INT3() { return DEBUGTRAP(); } +Value* INT3() +{ + return DEBUGTRAP(); +} -Value *VEXTRACTI128(Value* a, Constant* imm8); -Value *VINSERTI128(Value* a, Value* b, Constant* imm8); +Value* VEXTRACTI128(Value* a, Constant* imm8); +Value* VINSERTI128(Value* a, Value* b, Constant* imm8); // rdtsc buckets macros void RDTSC_START(Value* pBucketMgr, Value* pId); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 0abcd1a8d76..b4d326ebdcc 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file fetch_jit.cpp -* -* @brief Implementation of the fetch jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file fetch_jit.cpp + * + * @brief Implementation of the fetch jitter + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder_gfx_mem.h" #include "jit_api.h" @@ -54,42 +54,64 @@ enum ConversionType ////////////////////////////////////////////////////////////////////////// struct FetchJit : public BuilderGfxMem { - FetchJit(JitManager* pJitMgr) : - BuilderGfxMem(pJitMgr) - {} + FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {} Function* Create(const FETCH_COMPILE_STATE& fetchState); Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); - template<typename T> Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex); + template <typename T> + Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex); // package up Shuffle*bpcGatherd args into a tuple for convenience - typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType, - uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4], - const uint32_t(&)[4]> Shuffle8bpcArgs; - - void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args); - void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); - - typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType, - uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs; - - void Shuffle16bpcGather16(Shuffle16bpcArgs &args); - void Shuffle16bpcGather(Shuffle16bpcArgs &args); - - void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); - - Value *GenerateCompCtrlVector(const ComponentControl ctrl); - - void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); + typedef std::tuple<Value*&, + Value*, + const Instruction::CastOps, + const ConversionType, + uint32_t&, + uint32_t&, + const ComponentEnable, + const ComponentControl (&)[4], + Value* (&)[4], + const uint32_t (&)[4]> + Shuffle8bpcArgs; + + void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args); + void Shuffle8bpcGatherd(Shuffle8bpcArgs& args); + + typedef std::tuple<Value* (&)[2], + Value*, + const Instruction::CastOps, + const ConversionType, + uint32_t&, + uint32_t&, + const ComponentEnable, + const ComponentControl (&)[4], + Value* (&)[4]> + Shuffle16bpcArgs; + + void Shuffle16bpcGather16(Shuffle16bpcArgs& args); + void Shuffle16bpcGather(Shuffle16bpcArgs& args); + + void StoreVertexElements(Value* pVtxOut, + const uint32_t outputElt, + const uint32_t numEltsToStore, + Value* (&vVertexElements)[4]); + + Value* GenerateCompCtrlVector(const ComponentControl ctrl); + + void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, + Value* streams, + Value* vIndices, + Value* pVtxOut); bool IsOddFormat(SWR_FORMAT format); bool IsUniformFormat(SWR_FORMAT format); void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]); - void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); - void ConvertFormat(SWR_FORMAT format, Value *texels[4]); + void CreateGatherOddFormats( + SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); + void ConvertFormat(SWR_FORMAT format, Value* texels[4]); Value* mpWorkerData; Value* mpFetchInfo; @@ -100,25 +122,29 
@@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); - Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); - BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); + Function* fetch = Function::Create( + JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); fetch->getParent()->setModuleIdentifier(fetch->getName()); IRB()->SetInsertPoint(entry); - auto argitr = fetch->arg_begin(); + auto argitr = fetch->arg_begin(); // Fetch shader arguments - Value* privateContext = &*argitr; ++argitr; + Value* privateContext = &*argitr; + ++argitr; privateContext->setName("privateContext"); SetPrivateContext(privateContext); - mpWorkerData = &*argitr; ++argitr; + mpWorkerData = &*argitr; + ++argitr; mpWorkerData->setName("pWorkerData"); - mpFetchInfo = &*argitr; ++argitr; + mpFetchInfo = &*argitr; + ++argitr; mpFetchInfo->setName("fetchInfo"); - Value* pVtxOut = &*argitr; + Value* pVtxOut = &*argitr; pVtxOut->setName("vtxOutput"); uint32_t baseWidth = mVWidth; @@ -133,71 +159,77 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0)); // SWR_FETCH_CONTEXT::pStreams - Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); + Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams}); streams->setName("pStreams"); // SWR_FETCH_CONTEXT::pIndices - Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices}); + Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices}); indices->setName("pIndices"); // SWR_FETCH_CONTEXT::pLastIndex - Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex}); + Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex}); pLastIndex->setName("pLastIndex"); Value* vIndices; - switch(fetchState.indexType) + switch (fetchState.indexType) { - case R8_UINT: - indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); - if(fetchState.bDisableIndexOOBCheck) - { - vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); - vIndices = Z_EXT(vIndices, mSimdInt32Ty); - } - else - { - vIndices = GetSimdValid8bitIndices(indices, pLastIndex); - } - break; - case R16_UINT: - if(fetchState.bDisableIndexOOBCheck) - { - vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); - vIndices = Z_EXT(vIndices, mSimdInt32Ty); - } - else - { - vIndices = GetSimdValid16bitIndices(indices, pLastIndex); - } - break; - case R32_UINT: - (fetchState.bDisableIndexOOBCheck) ? 
vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH) - : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); - break; // incoming type is already 32bit int - default: - SWR_INVALID("Unsupported index type"); - vIndices = nullptr; - break; + case R8_UINT: + indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); + if (fetchState.bDisableIndexOOBCheck) + { + vIndices = LOAD( + BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), + {(uint32_t)0}); + vIndices = Z_EXT(vIndices, mSimdInt32Ty); + } + else + { + vIndices = GetSimdValid8bitIndices(indices, pLastIndex); + } + break; + case R16_UINT: + if (fetchState.bDisableIndexOOBCheck) + { + vIndices = LOAD( + BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), + {(uint32_t)0}); + vIndices = Z_EXT(vIndices, mSimdInt32Ty); + } + else + { + vIndices = GetSimdValid16bitIndices(indices, pLastIndex); + } + break; + case R32_UINT: + (fetchState.bDisableIndexOOBCheck) + ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH) + : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); + break; // incoming type is already 32bit int + default: + SWR_INVALID("Unsupported index type"); + vIndices = nullptr; + break; } - if(fetchState.bForceSequentialAccessEnable) + if (fetchState.bForceSequentialAccessEnable) { - Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) : - C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); + Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7}) + : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // VertexData buffers are accessed sequentially, the index is equal to the vertex number - vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); + vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); vIndices = ADD(vIndices, pOffsets); } Value* vVertexId = vIndices; if (fetchState.bVertexIDOffsetEnable) { - // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); - Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); - vVertexId = ADD(vIndices, vBaseVertex); - vVertexId = ADD(vVertexId, vStartVertex); + // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally + // correct + Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); + Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); + vVertexId = ADD(vIndices, vBaseVertex); + vVertexId = ADD(vVertexId, vStartVertex); } // store out vertex IDs @@ -206,30 +238,30 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) // store out in simd8 halves until core supports 16-wide natively auto vVertexIdLo = EXTRACT_16(vVertexId, 0); auto vVertexIdHi = EXTRACT_16(vVertexId, 1); - STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); - STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })); + STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); + STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})); } else if (mVWidth == 8) { - STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); + STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); } // store out cut mask if enabled if 
(fetchState.bEnableCutIndex) { Value* vCutIndex = VIMMED1(fetchState.cutIndex); - Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); - + Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); + if (mVWidth == 16) { auto cutMaskLo = EXTRACT_16(cutMask, 0); auto cutMaskHi = EXTRACT_16(cutMask, 1); - STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); - STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 })); + STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); + STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2})); } else if (mVWidth == 8) { - STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); + STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); } } @@ -279,7 +311,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) #if USE_SIMD16_SHADERS SetTargetWidth(baseWidth); #endif - + return fetch; } @@ -297,9 +329,9 @@ bool FetchJit::IsOddFormat(SWR_FORMAT format) // format is uniform if all components are the same size and type bool FetchJit::IsUniformFormat(SWR_FORMAT format) { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - uint32_t bpc0 = info.bpc[0]; - uint32_t type0 = info.type[0]; + const SWR_FORMAT_INFO& info = GetFormatInfo(format); + uint32_t bpc0 = info.bpc[0]; + uint32_t type0 = info.type[0]; for (uint32_t c = 1; c < info.numComps; ++c) { @@ -323,10 +355,10 @@ void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[ for (uint32_t c = 0; c < info.numComps; ++c) { uint32_t swizzledIndex = info.swizzle[c]; - uint32_t compBits = info.bpc[c]; - uint32_t bitmask = ((1 << compBits) - 1) << bitOffset; - Value* comp = AND(vInput, bitmask); - comp = LSHR(comp, bitOffset); + uint32_t compBits = info.bpc[c]; + uint32_t bitmask = ((1 << compBits) - 1) << bitOffset; + Value* comp = AND(vInput, bitmask); + comp = LSHR(comp, bitOffset); result[swizzledIndex] = comp; bitOffset += compBits; @@ -336,14 +368,15 @@ void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[ // gather for odd component size formats // gather SIMD full pixels per lane then shift/mask to move each component to their // own vector -void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4]) +void FetchJit::CreateGatherOddFormats( + SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4]) { - const SWR_FORMAT_INFO &info = GetFormatInfo(format); + const SWR_FORMAT_INFO& info = GetFormatInfo(format); // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); - Value *pGather; + Value* pGather; if (info.bpp == 32) { pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); @@ -351,17 +384,17 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB else { // Can't use 32-bit gather for items less than 32-bits, could cause page faults. 
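// Illustrative sketch (not part of this change, hypothetical helper name):
// scalar behavior of the narrow-gather fallback that follows. Because a
// 32-bit gather could read past the end of the buffer for 8/16-bit texels,
// each enabled lane is loaded at its exact width and zero-extended, while
// masked-off lanes keep the 0 written by STORE(VIMMED1(0u), pMem).
#include <cstdint>
#include <cstring>

void gather_narrow_ref(const uint8_t* base, const int32_t* offsets,
                       const bool* mask, uint32_t bpp /* 8 or 16 */,
                       uint32_t lanes, uint32_t* out)
{
    for (uint32_t lane = 0; lane < lanes; ++lane)
    {
        out[lane] = 0; // every lane starts as zero, like the stack slot
        if (!mask[lane])
            continue; // masked-off lanes stay zero
        if (bpp == 8)
        {
            out[lane] = base[offsets[lane]]; // zero-extend a byte
        }
        else // bpp == 16
        {
            uint16_t v;
            std::memcpy(&v, base + offsets[lane], sizeof(v));
            out[lane] = v; // zero-extend a short
        }
    }
}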
- Value *pMem = ALLOCA(mSimdInt32Ty); + Value* pMem = ALLOCA(mSimdInt32Ty); STORE(VIMMED1(0u), pMem); - pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0)); + pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0)); Value* pDstMem = BITCAST(pMem, mInt32PtrTy); for (uint32_t lane = 0; lane < mVWidth; ++lane) { // Get index Value* index = VEXTRACT(pOffsets, C(lane)); - Value* mask = VEXTRACT(pMask, C(lane)); + Value* mask = VEXTRACT(pMask, C(lane)); switch (info.bpp) { case 8: @@ -418,9 +451,9 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB pResult[3] = BITCAST(pResult[3], mSimdFP32Ty); } -void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) +void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4]) { - const SWR_FORMAT_INFO &info = GetFormatInfo(format); + const SWR_FORMAT_INFO& info = GetFormatInfo(format); for (uint32_t c = 0; c < info.numComps; ++c) { @@ -436,13 +469,14 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) { if (info.type[c] == SWR_TYPE_SNORM) { - /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f. + /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to + /// -1.0f. /// result = c * (1.0f / (2^(n-1) - 1); - uint32_t n = info.bpc[c]; - uint32_t pow2 = 1 << (n - 1); - float scale = 1.0f / (float)(pow2 - 1); - Value *vScale = VIMMED1(scale); + uint32_t n = info.bpc[c]; + uint32_t pow2 = 1 << (n - 1); + float scale = 1.0f / (float)(pow2 - 1); + Value* vScale = VIMMED1(scale); texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); texels[compIndex] = FMUL(texels[compIndex], vScale); @@ -452,21 +486,22 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM); /// result = c * (1.0f / (2^n - 1)) - uint32_t n = info.bpc[c]; + uint32_t n = info.bpc[c]; uint32_t pow2 = 1 << n; - // special case 24bit unorm format, which requires a full divide to meet ULP requirement + // special case 24bit unorm format, which requires a full divide to meet ULP + // requirement if (n == 24) { - float scale = (float)(pow2 - 1); - Value* vScale = VIMMED1(scale); + float scale = (float)(pow2 - 1); + Value* vScale = VIMMED1(scale); texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); texels[compIndex] = FDIV(texels[compIndex], vScale); } else { - float scale = 1.0f / (float)(pow2 - 1); - Value *vScale = VIMMED1(scale); + float scale = 1.0f / (float)(pow2 - 1); + Value* vScale = VIMMED1(scale); texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty); texels[compIndex] = FMUL(texels[compIndex], vScale); @@ -483,17 +518,19 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) /// @param streams - value pointer to the current vertex stream /// @param vIndices - vector value of indices to gather /// @param pVtxOut - value pointer to output simdvertex struct -void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, - Value* streams, Value* vIndices, Value* pVtxOut) +void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, + Value* streams, + Value* vIndices, + Value* pVtxOut) { uint32_t currentVertexElement = 0; - uint32_t outputElt = 0; - Value* vVertexElements[4]; + uint32_t outputElt = 0; + Value* vVertexElements[4]; - Value* startVertex = LOAD(mpFetchInfo, {0, 
SWR_FETCH_CONTEXT_StartVertex}); + Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); - Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); + Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); + Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); curInstance->setName("curInstance"); for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) @@ -506,23 +543,25 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, continue; } - const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); + const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format); SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices."); - uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. + uint32_t bpc = + info.bpp / + info.numComps; ///@todo Code below assumes all components are same size. Need to fix. - Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); + Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); // VGATHER* takes an *i8 src pointer - Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0)); + Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0)); - Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); - Value *vStride = VBROADCAST(stride); + Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); + Value* vStride = VBROADCAST(stride); // max vertex index that is fully in bounds - Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); - maxVertex = LOAD(maxVertex); + Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); + maxVertex = LOAD(maxVertex); - Value *minVertex = NULL; + Value* minVertex = NULL; if (fetchState.bPartialVertexBuffer) { // min vertex index for low bounds OOB checking @@ -536,9 +575,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, curInstance = ADD(curInstance, startInstance); } - Value *vCurIndices; - Value *startOffset; - Value *vInstanceStride = VIMMED1(0); + Value* vCurIndices; + Value* startOffset; + Value* vInstanceStride = VIMMED1(0); if (ied.InstanceEnable) { @@ -546,7 +585,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // prevent a div by 0 for 0 step rate Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); - stepRate = SELECT(isNonZeroStep, stepRate, C(1)); + stepRate = SELECT(isNonZeroStep, stepRate, C(1)); // calc the current offset into instanced data buffer Value* calcInstance = UDIV(curInstance, stepRate); @@ -559,7 +598,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else if (ied.InstanceStrideEnable) { - // grab the instance advancement state, determines stride in bytes from one instance to the next + // grab the instance advancement state, determines stride in bytes from one instance to + // the next Value* stepRate = C(ied.InstanceAdvancementState); vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); @@ -576,16 +616,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, startOffset = startVertex; } - // All of the OOB calculations are 
in vertices, not VB offsets, to prevent having to + // All of the OOB calculations are in vertices, not VB offsets, to prevent having to // do 64bit address offset calculations. // calculate byte offset to the start of the VB - Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); - pStreamBase = GEP(pStreamBase, baseOffset); + Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); + pStreamBase = GEP(pStreamBase, baseOffset); Value* pStreamBaseGFX = ADD(stream, baseOffset); // if we have a start offset, subtract from max vertex. Used for OOB check - maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); + maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0)); // if we have a negative value, we're already OOB. clamp at 0. maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty)); @@ -593,38 +633,39 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (fetchState.bPartialVertexBuffer) { // similary for min vertex - minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); - Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0)); - minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty)); + minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); + Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0)); + minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty)); } // Load the in bounds size of a partially valid vertex - Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); - partialInboundsSize = LOAD(partialInboundsSize); - Value *vPartialVertexSize = VBROADCAST(partialInboundsSize); - Value *vBpp = VBROADCAST(C(info.Bpp)); - Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); + Value* partialInboundsSize = + GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); + partialInboundsSize = LOAD(partialInboundsSize); + Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); + Value* vBpp = VBROADCAST(C(info.Bpp)); + Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); // is the element is <= the partially valid size - Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); + Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); - vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); + vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); // are vertices partially OOB? - Value* vMaxVertex = VBROADCAST(maxVertex); + Value* vMaxVertex = VBROADCAST(maxVertex); Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); // are vertices fully in bounds? Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); - Value *vGatherMask; + Value* vGatherMask; if (fetchState.bPartialVertexBuffer) { // are vertices below minVertex limit? 
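The clamp arithmetic just above widens maxVertex and startOffset to 64 bits before subtracting, so the adjustment cannot wrap, then clamps a negative result to zero before truncating back to 32 bits for the per-lane compare. A small scalar sketch of that bound computation, with made-up inputs and no claim to match the jitted IR beyond the semantics:

    // Widen to 64 bits so "maxVertex - startOffset" cannot wrap; a negative
    // result means the stream is already out of bounds, so clamp to 0 and
    // every lane's index compare will fail (empty gather mask).
    #include <cstdint>
    #include <cstdio>

    static uint32_t ClampMaxVertex(uint32_t maxVertex, uint32_t startOffset)
    {
        int64_t adjusted = (int64_t)maxVertex - (int64_t)startOffset;
        return (adjusted < 0) ? 0u : (uint32_t)adjusted;
    }

    int main()
    {
        printf("%u\n", ClampMaxVertex(100, 30)); // 70: normal case
        printf("%u\n", ClampMaxVertex(4, 10));   // 0: start past the end of the VB
        return 0;
    }

The minVertex handling that continues below applies the identical widen-subtract-clamp pattern to the lower bound of a partially valid vertex buffer.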
- Value *vMinVertex = VBROADCAST(minVertex); - Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); + Value* vMinVertex = VBROADCAST(minVertex); + Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); // only fetch lanes that pass both tests vGatherMask = AND(vMaxGatherMask, vMinGatherMask); @@ -639,23 +680,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // calculate the actual offsets into the VB Value* vOffsets = MUL(vCurIndices, vStride); - vOffsets = ADD(vOffsets, vAlignmentOffsets); + vOffsets = ADD(vOffsets, vAlignmentOffsets); // if instance stride enable is: // true - add product of the instanceID and advancement state to the offst into the VB // false - value of vInstanceStride has been initialialized to zero vOffsets = ADD(vOffsets, vInstanceStride); - // Packing and component control - ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; - const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, - (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; + // Packing and component control + ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; + const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0, + (ComponentControl)ied.ComponentControl1, + (ComponentControl)ied.ComponentControl2, + (ComponentControl)ied.ComponentControl3}; // Special gather/conversion for formats without equal component sizes if (IsOddFormat((SWR_FORMAT)ied.Format)) { - Value *pResults[4]; - CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); + Value* pResults[4]; + CreateGatherOddFormats( + (SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults); for (uint32_t c = 0; c < 4; c += 1) @@ -672,193 +716,214 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } } } - else if(info.type[0] == SWR_TYPE_FLOAT) + else if (info.type[0] == SWR_TYPE_FLOAT) { ///@todo: support 64 bit vb accesses - Value *gatherSrc = VIMMED1(0.0f); + Value* gatherSrc = VIMMED1(0.0f); - SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), - "Unsupported format for standard gather fetch."); + SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), + "Unsupported format for standard gather fetch."); // Gather components from memory to store in a simdvertex structure switch (bpc) { - case 16: - { - Value *vGatherResult[2]; + case 16: + { + Value* vGatherResult[2]; - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } + // if we have at least one component out of x or y to fetch + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); + // e.g. 
result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + } - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } + // if we have at least one component out of z or w to fetch + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + // offset base to the next components(zw) in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); - // if we have at least one component to shuffle into place - if (compMask) - { - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); + vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref - } + // if we have at least one component to shuffle into place + if (compMask) + { + Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, + pVtxOut, + Instruction::CastOps::FPExt, + CONVERT_NONE, + currentVertexElement, + outputElt, + compMask, + compCtrl, + vVertexElements); + + // Shuffle gathered components into place in simdvertex struct + mVWidth == 16 ? Shuffle16bpcGather16(args) + : Shuffle16bpcGather(args); // outputs to vVertexElements ref } - break; - case 32: + } + break; + case 32: + { + for (uint32_t i = 0; i < 4; i += 1) { - for (uint32_t i = 0; i < 4; i += 1) + if (isComponentEnabled(compMask, i)) { - if (isComponentEnabled(compMask, i)) + // if we need to gather the component + if (compCtrl[i] == StoreSrc) { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) - { - // Gather a SIMD of vertices - // APIs allow a 4GB range for offsets - // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( - // But, we know that elements must be aligned for FETCH. :) - // Right shift the offset by a bit and then scale by 2 to remove the sign extension. - Value *vShiftedOffsets = LSHR(vOffsets, 1); - vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH); - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } + // Gather a SIMD of vertices + // APIs allow a 4GB range for offsets + // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( + // But, we know that elements must be aligned for FETCH. :) + // Right shift the offset by a bit and then scale by 2 to remove the + // sign extension. 
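The comment block above describes the workaround for GATHERPS sign-extending its 32-bit offsets: since FETCH elements are at least 4-byte aligned, every offset is even, so the jit shifts the offset right by one and hands GATHERPS a scale of 2, keeping the shifted value in positive signed range while addressing the full 4GB span. A minimal C++ model of that address identity, with hypothetical base and offset values:

    // base + offset == base + 2 * (offset >> 1) whenever offset is even.
    // The shifted offset fits in a non-negative int32_t even when the raw
    // offset is above 2GB, which is what the signed gather requires.
    #include <cstdint>
    #include <cassert>

    int main()
    {
        uint32_t offset  = 0x90000000u;        // > 2GB: negative if sign-extended
        uint32_t shifted = offset >> 1;        // 0x48000000: positive as int32_t
        assert((int32_t)shifted >= 0);

        uint64_t base = 0x100000000ull;        // arbitrary 64-bit stream base
        uint64_t addr = base + 2ull * shifted; // gather with scale = 2
        assert(addr == base + offset);         // same address as the raw offset
        return 0;
    }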
+ Value* vShiftedOffsets = LSHR(vOffsets, 1); + vVertexElements[currentVertexElement++] = + GATHERPS(gatherSrc, + pStreamBaseGFX, + vShiftedOffsets, + vGatherMask, + 2, + GFX_MEM_CLIENT_FETCH); + } + else + { + vVertexElements[currentVertexElement++] = + GenerateCompCtrlVector(compCtrl[i]); } - // offset base to the next component in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } } + + // offset base to the next component in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); + pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); } - break; - case 64: + } + break; + case 64: + { + for (uint32_t i = 0; i < 4; i += 1) { - for (uint32_t i = 0; i < 4; i += 1) + if (isComponentEnabled(compMask, i)) { - if (isComponentEnabled(compMask, i)) + // if we need to gather the component + if (compCtrl[i] == StoreSrc) { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) - { - Value* vShufLo; - Value* vShufHi; - Value* vShufAll; + Value* vShufLo; + Value* vShufHi; + Value* vShufAll; - if (mVWidth == 8) - { - vShufLo = C({ 0, 1, 2, 3 }); - vShufHi = C({ 4, 5, 6, 7 }); - vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); - } - else - { - SWR_ASSERT(mVWidth == 16); - vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); - vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 }); - vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); - } + if (mVWidth == 8) + { + vShufLo = C({0, 1, 2, 3}); + vShufHi = C({4, 5, 6, 7}); + vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7}); + } + else + { + SWR_ASSERT(mVWidth == 16); + vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7}); + vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15}); + vShufAll = + C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + } - Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo); - Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi); + Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo); + Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi); - Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo); - Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi); + Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo); + Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi); - Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); + Value* vZeroDouble = VECTOR_SPLAT( + mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); - Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); - Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); + Value* pGatherLo = + GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); + Value* pGatherHi = + GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); - pGatherLo = VCVTPD2PS(pGatherLo); - pGatherHi = VCVTPD2PS(pGatherHi); + pGatherLo = VCVTPD2PS(pGatherLo); + pGatherHi = VCVTPD2PS(pGatherHi); - Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll); + Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll); - vVertexElements[currentVertexElement++] = pGather; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 
0; - } + vVertexElements[currentVertexElement++] = pGather; + } + else + { + vVertexElements[currentVertexElement++] = + GenerateCompCtrlVector(compCtrl[i]); } - // offset base to the next component in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)8)); + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } } + + // offset base to the next component in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)8)); } - break; - default: - SWR_INVALID("Tried to fetch invalid FP format"); - break; + } + break; + default: + SWR_INVALID("Tried to fetch invalid FP format"); + break; } } else { Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; - ConversionType conversionType = CONVERT_NONE; + ConversionType conversionType = CONVERT_NONE; - SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), - "Unsupported format for standard gather fetch."); + SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), + "Unsupported format for standard gather fetch."); - switch(info.type[0]) + switch (info.type[0]) { - case SWR_TYPE_UNORM: - conversionType = CONVERT_NORMALIZED; - case SWR_TYPE_UINT: - extendCastType = Instruction::CastOps::ZExt; - break; - case SWR_TYPE_SNORM: - conversionType = CONVERT_NORMALIZED; - case SWR_TYPE_SINT: - extendCastType = Instruction::CastOps::SExt; - break; - case SWR_TYPE_USCALED: - conversionType = CONVERT_USCALED; - extendCastType = Instruction::CastOps::UIToFP; - break; - case SWR_TYPE_SSCALED: - conversionType = CONVERT_SSCALED; - extendCastType = Instruction::CastOps::SIToFP; - break; - case SWR_TYPE_SFIXED: - conversionType = CONVERT_SFIXED; - extendCastType = Instruction::CastOps::SExt; - break; - default: - break; + case SWR_TYPE_UNORM: + conversionType = CONVERT_NORMALIZED; + case SWR_TYPE_UINT: + extendCastType = Instruction::CastOps::ZExt; + break; + case SWR_TYPE_SNORM: + conversionType = CONVERT_NORMALIZED; + case SWR_TYPE_SINT: + extendCastType = Instruction::CastOps::SExt; + break; + case SWR_TYPE_USCALED: + conversionType = CONVERT_USCALED; + extendCastType = Instruction::CastOps::UIToFP; + break; + case SWR_TYPE_SSCALED: + conversionType = CONVERT_SSCALED; + extendCastType = Instruction::CastOps::SIToFP; + break; + case SWR_TYPE_SFIXED: + conversionType = CONVERT_SFIXED; + extendCastType = Instruction::CastOps::SExt; + break; + default: + break; } // value substituted when component of gather is masked @@ -867,113 +932,132 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Gather components from memory to store in a simdvertex structure switch (bpc) { - case 8: + case 8: + { + // if we have at least one component to fetch + if (compMask) { - // if we have at least one component to fetch - if (compMask) - { - Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - - Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); - - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref - } + Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + // e.g. 
result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, + pVtxOut, + extendCastType, + conversionType, + currentVertexElement, + outputElt, + compMask, + compCtrl, + vVertexElements, + info.swizzle); + + // Shuffle gathered components into place in simdvertex struct + mVWidth == 16 ? Shuffle8bpcGatherd16(args) + : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref } - break; - case 16: - { - Value *vGatherResult[2]; + } + break; + case 16: + { + Value* vGatherResult[2]; - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } + // if we have at least one component out of x or y to fetch + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + } - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } + // if we have at least one component out of z or w to fetch + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + // offset base to the next components(zw) in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); - // if we have at least one component to shuffle into place - if (compMask) - { - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); + vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref - } + // if we have at least one component to shuffle into place + if (compMask) + { + Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, + pVtxOut, + extendCastType, + conversionType, + currentVertexElement, + outputElt, + compMask, + compCtrl, + vVertexElements); + + // Shuffle gathered components into place in simdvertex struct + mVWidth == 16 ? 
Shuffle16bpcGather16(args) + : Shuffle16bpcGather(args); // outputs to vVertexElements ref } - break; - case 32: + } + break; + case 32: + { + // Gathered components into place in simdvertex struct + for (uint32_t i = 0; i < 4; i++) { - // Gathered components into place in simdvertex struct - for (uint32_t i = 0; i < 4; i++) + if (isComponentEnabled(compMask, i)) { - if (isComponentEnabled(compMask, i)) + // if we need to gather the component + if (compCtrl[i] == StoreSrc) { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) + Value* pGather = + GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + + if (conversionType == CONVERT_USCALED) { - Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - - if (conversionType == CONVERT_USCALED) - { - pGather = UI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SSCALED) - { - pGather = SI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SFIXED) - { - pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f))); - } - - vVertexElements[currentVertexElement++] = pGather; - - // e.g. result of a single 8x32bit integer gather for 32bit components - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx + pGather = UI_TO_FP(pGather, mSimdFP32Ty); } - else + else if (conversionType == CONVERT_SSCALED) { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); + pGather = SI_TO_FP(pGather, mSimdFP32Ty); } - - if (currentVertexElement > 3) + else if (conversionType == CONVERT_SFIXED) { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - - // reset to the next vVertexElement to output - currentVertexElement = 0; + pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), + VBROADCAST(C(1 / 65536.0f))); } + vVertexElements[currentVertexElement++] = pGather; + + // e.g. 
result of a single 8x32bit integer gather for 32bit components + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx } + else + { + vVertexElements[currentVertexElement++] = + GenerateCompCtrlVector(compCtrl[i]); + } + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // offset base to the next component in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } } + + // offset base to the next component in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); } - break; + } + break; } } } @@ -985,13 +1069,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } } -template<typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex) +template <typename T> +Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex) { - SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters."); + SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, + "Function expects gfxptr_t for both input parameters."); Type* Ty = nullptr; - static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper<T>"); + static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), + "Unsupported type for use with GetSimdValidIndicesHelper<T>"); constexpr bool bSize = (sizeof(T) == sizeof(uint16_t)); if (bSize) { @@ -1017,19 +1104,19 @@ template<typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, for (int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index - Value *pIndex = GEP(pIndices, C(lane), Ty); + Value* pIndex = GEP(pIndices, C(lane), Ty); pLastIndex = INT_TO_PTR(pLastIndex, Ty); - // check if the address is less than the max index, + // check if the address is less than the max index, Value* mask = ICMP_ULT(pIndex, pLastIndex); // if valid, load the index. 
if not, load 0 from the stack Value* pValid = SELECT(mask, pIndex, pZeroIndex); - Value *index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH); + Value* index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH); // zero extended index to 32 bits and insert into the correct simd lane - index = Z_EXT(index, mInt32Ty); + index = Z_EXT(index, mInt32Ty); vIndices = VINSERT(vIndices, index, lane); } } @@ -1066,23 +1153,23 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) { DataLayout dL(JM()->mpCurrentModule); - Value* iLastIndex = pLastIndex; - Value* iIndices = pIndices; + Value* iLastIndex = pLastIndex; + Value* iIndices = pIndices; // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) - Value* numIndicesLeft = SUB(iLastIndex,iIndices); - numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); - numIndicesLeft = SDIV(numIndicesLeft, C(4)); + Value* numIndicesLeft = SUB(iLastIndex, iIndices); + numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); + numIndicesLeft = SDIV(numIndicesLeft, C(4)); // create a vector of index counts from the base index ptr passed into the fetch Constant* vIndexOffsets; if (mVWidth == 8) { - vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); + vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7}); } else { - vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); + vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); } // compare index count to the max valid index @@ -1091,16 +1178,22 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) // ------------------------------ // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 - Value* vMaxIndex = VBROADCAST(numIndicesLeft); + Value* vMaxIndex = VBROADCAST(numIndicesLeft); Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets); // Load the indices; OOB loads 0 - return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0), "vIndices", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH); + return MASKED_LOAD(pIndices, + 4, + vIndexMask, + VIMMED1(0), + "vIndices", + PointerType::get(mSimdInt32Ty, 0), + GFX_MEM_CLIENT_FETCH); } ////////////////////////////////////////////////////////////////////////// -/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, -/// denormalizes if needed, converts to F32 if needed, and positions in +/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, +/// denormalizes if needed, converts to F32 if needed, and positions in // the proper SIMD rows to be output to the simdvertex structure /// @param args: (tuple of args, listed below) /// @param vGatherResult - 8 gathered 8bpc vertices @@ -1113,60 +1206,67 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output /// @param swizzle[4] - component swizzle location -void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) +void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args) { // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = 
std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); - const uint32_t(&swizzle)[4] = std::get<9>(args); + Value*& vGatherResult = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t& currentVertexElement = std::get<4>(args); + uint32_t& outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value*(&vVertexElements)[4] = std::get<8>(args); + const uint32_t(&swizzle)[4] = std::get<9>(args); // cast types - Type *vGatherTy = VectorType::get(mInt32Ty, 8); - Type *v32x8Ty = VectorType::get(mInt8Ty, 32); + Type* vGatherTy = VectorType::get(mInt32Ty, 8); + Type* v32x8Ty = VectorType::get(mInt8Ty, 32); // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)) { - Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane - Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2); + Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2); // shuffle mask, including any swizzling - const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; - const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; - Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12), - char(y), char(y + 4), char(y + 8), char(y + 12), - char(z), char(z + 4), char(z + 8), char(z + 12), - char(w), char(w + 4), char(w + 8), char(w + 12), - char(x), char(x + 4), char(x + 8), char(x + 12), - char(y), char(y + 4), char(y + 8), char(y + 12), - char(z), char(z + 4), char(z + 8), char(z + 12), - char(w), char(w + 4), char(w + 8), char(w + 12) }); + const char x = (char)swizzle[0]; + const char y = (char)swizzle[1]; + const char z = (char)swizzle[2]; + const char w = (char)swizzle[3]; + Value* vConstMask = C<char>( + {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4), + char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12), + char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4), + char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12), + char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4), + char(w + 8), char(w + 12)}); // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. 
- Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); + Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); - Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + Value* vShufResult_lo = + BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value* vShufResult_hi = + BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - Value *vi128XY_lo = nullptr; - Value *vi128XY_hi = nullptr; + Value* vi128XY_lo = nullptr; + Value* vi128XY_hi = nullptr; if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { - vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); - vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); + vi128XY_lo = BITCAST( + VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), + v128Ty); + vi128XY_hi = BITCAST( + VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), + v128Ty); // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane // 256i - 0 1 2 3 4 5 6 7 @@ -1174,26 +1274,30 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) } // do the same for zw components - Value *vi128ZW_lo = nullptr; - Value *vi128ZW_hi = nullptr; + Value* vi128ZW_lo = nullptr; + Value* vi128ZW_hi = nullptr; if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { - vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); - vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); + vi128ZW_lo = BITCAST( + VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), + v128Ty); + vi128ZW_hi = BITCAST( + VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), + v128Ty); } // init denormalize variables if needed Instruction::CastOps fpCast; - Value *conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::SIToFP; + fpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0 / 127.0)); break; case CONVERT_SSCALED: - fpCast = Instruction::CastOps::SIToFP; + fpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0)); break; case CONVERT_USCALED: @@ -1206,7 +1310,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) break; } - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + // sign extend all enabled components. If we have a fill vVertexElements, output to current + // simdvertex for (uint32_t i = 0; i < 4; i++) { if (isComponentEnabled(compMask, i)) @@ -1216,12 +1321,14 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value *selectedPermute_lo = (i < 2) ? 
vi128XY_lo : vi128ZW_lo; - Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; + Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; // sign extend - Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty)); - Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty)); + Value* temp_lo = + PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty)); + Value* temp_hi = + PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty)); Value* temp = JOIN_16(temp_lo, temp_hi); @@ -1250,20 +1357,21 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) } } // else zero extend - else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) + else if ((extendType == Instruction::CastOps::ZExt) || + (extendType == Instruction::CastOps::UIToFP)) { // init denormalize variables if needed Instruction::CastOps fpCast; - Value *conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0 / 255.0)); break; case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0)); break; case CONVERT_SSCALED: @@ -1284,43 +1392,49 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) if (compCtrl[i] == ComponentControl::StoreSrc) { // pshufb masks for each component - Value *vConstMask; + Value* vConstMask; switch (swizzle[i]) { case 0: // x shuffle mask - vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); + vConstMask = + C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); break; case 1: // y shuffle mask - vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); + vConstMask = + C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); break; case 2: // z shuffle mask - vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); + vConstMask = + C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); break; case 3: // w shuffle mask - vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); + vConstMask = + C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); break; default: vConstMask = nullptr; break; } - Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); + Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); - Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + Value* temp_lo = + BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), 
vConstMask), vGatherTy); + Value* temp_hi = + BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); // after pshufb for x channel // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 + // x000 x000 x000 x000 x000 x000 x000 x000 Value* temp = JOIN_16(temp_lo, temp_hi); @@ -1354,19 +1468,19 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) } } -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args) { // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); - const uint32_t(&swizzle)[4] = std::get<9>(args); + Value*& vGatherResult = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t& currentVertexElement = std::get<4>(args); + uint32_t& outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value*(&vVertexElements)[4] = std::get<8>(args); + const uint32_t(&swizzle)[4] = std::get<9>(args); // cast types Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits @@ -1379,18 +1493,19 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) if (compCtrl[i] == ComponentControl::StoreSrc) { std::vector<uint32_t> vShuffleMasks[4] = { - { 0, 4, 8, 12, 16, 20, 24, 28 }, // x - { 1, 5, 9, 13, 17, 21, 25, 29 }, // y - { 2, 6, 10, 14, 18, 22, 26, 30 }, // z - { 3, 7, 11, 15, 19, 23, 27, 31 }, // w + {0, 4, 8, 12, 16, 20, 24, 28}, // x + {1, 5, 9, 13, 17, 21, 25, 29}, // y + {2, 6, 10, 14, 18, 22, 26, 30}, // z + {3, 7, 11, 15, 19, 23, 27, 31}, // w }; - Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), - UndefValue::get(v32x8Ty), - vShuffleMasks[swizzle[i]]); + Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), + UndefValue::get(v32x8Ty), + vShuffleMasks[swizzle[i]]); if ((extendType == Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP)) { + (extendType == Instruction::CastOps::SIToFP)) + { switch (conversionType) { case CONVERT_NORMALIZED: @@ -1409,7 +1524,8 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) } } else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) { + (extendType == Instruction::CastOps::UIToFP)) + { switch (conversionType) { case CONVERT_NORMALIZED: @@ -1449,8 +1565,8 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) } ////////////////////////////////////////////////////////////////////////// -/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, -/// denormalizes if needed, converts to F32 if needed, and positions in +/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, +/// denormalizes if needed, converts to F32 if needed, and positions in // the proper SIMD rows to be output to the simdvertex structure /// @param args: (tuple of args, listed below) /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index @@ -1462,53 +1578,59 @@ void 
FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) /// @param compMask - component packing mask /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output -void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) +void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args) { // Unpack tuple args - Value* (&vGatherResult)[2] = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); + Value*(&vGatherResult)[2] = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t& currentVertexElement = std::get<4>(args); + uint32_t& outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value*(&vVertexElements)[4] = std::get<8>(args); // cast types - Type *vGatherTy = VectorType::get(mInt32Ty, 8); - Type *v32x8Ty = VectorType::get(mInt8Ty, 32); + Type* vGatherTy = VectorType::get(mInt32Ty, 8); + Type* v32x8Ty = VectorType::get(mInt8Ty, 32); // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) + if ((extendType == Instruction::CastOps::SExt) || + (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) { // is this PP float? bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2); + Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2); // shuffle mask - Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); - Value *vi128XY_lo = nullptr; - Value *vi128XY_hi = nullptr; + Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vi128XY_lo = nullptr; + Value* vi128XY_hi = nullptr; if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for + // now.. 
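For the 16bpc path that follows, the PSHUFB control {0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15} regroups the interleaved 16-bit x,y pairs of each gathered 32-bit element so all x halves land in the front of the 128-bit lane and all y halves in the back, ready for the PERMD/VSHUFFLE pack. A scalar stand-in for that byte shuffle, using made-up input values:

    // Model one 128-bit lane of the 16bpc regroup: PSHUFB is a per-byte
    // select, so copying bytes through the control array turns the
    // interleaved xyxy layout into xxxx yyyy.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main()
    {
        uint16_t interleaved[8] = {1, 101, 2, 102, 3, 103, 4, 104}; // x,y pairs
        uint8_t lane[16], out[16];
        memcpy(lane, interleaved, 16);

        const uint8_t ctrl[16] = {0, 1, 4, 5, 8, 9, 12, 13,
                                  2, 3, 6, 7, 10, 11, 14, 15};
        for (int i = 0; i < 16; ++i)
            out[i] = lane[ctrl[i]]; // PSHUFB: per-byte select from the lane

        uint16_t grouped[8];
        memcpy(grouped, out, 16);
        for (int i = 0; i < 8; ++i)
            printf("%u ", grouped[i]); // prints: 1 2 3 4 101 102 103 104
        printf("\n");
        return 0;
    }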
- Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty); - Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty); + Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty); + Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty); - Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); + Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); + Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128XY_lo = BITCAST( + VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), + v128bitTy); + vi128XY_hi = BITCAST( + VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), + v128bitTy); // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 @@ -1516,32 +1638,36 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) } // do the same for zw components - Value *vi128ZW_lo = nullptr; - Value *vi128ZW_hi = nullptr; + Value* vi128ZW_lo = nullptr; + Value* vi128ZW_hi = nullptr; if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { - Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty); - Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty); - - Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); - - vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty); + Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty); + + Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); + Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); + + vi128ZW_lo = BITCAST( + VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), + v128bitTy); + vi128ZW_hi = BITCAST( + VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), + v128bitTy); } // init denormalize variables if needed Instruction::CastOps IntToFpCast; - Value *conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - IntToFpCast = Instruction::CastOps::SIToFP; + IntToFpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0 / 32767.0)); break; case CONVERT_SSCALED: - IntToFpCast = Instruction::CastOps::SIToFP; + IntToFpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0)); break; case CONVERT_USCALED: @@ -1554,7 +1680,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) break; } - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + // sign extend all enabled components. 
If we have a fill vVertexElements, output to current + // simdvertex for (uint32_t i = 0; i < 4; i++) { if (isComponentEnabled(compMask, i)) @@ -1564,22 +1691,26 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; - Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; + Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; if (bFP) { // extract 128 bit lanes to sign extend each component - Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + Value* temp_lo = + CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value* temp_hi = + CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } else { // extract 128 bit lanes to sign extend each component - Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + Value* temp_lo = + PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value* temp_hi = + PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); Value* temp = JOIN_16(temp_lo, temp_hi); @@ -1609,37 +1740,40 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) } } // else zero extend - else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) + else if ((extendType == Instruction::CastOps::ZExt) || + (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component - Value *vConstMask[2]; + Value* vConstMask[2]; if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) { // x/z shuffle mask - vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + vConstMask[0] = C<char>({ + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + }); } if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) { // y/w shuffle mask - vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); } // init denormalize variables if needed Instruction::CastOps fpCast; - Value* conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0 / 65535.0)); break; case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0f)); break; case CONVERT_SSCALED: @@ -1664,17 +1798,22 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 
0 : 1; - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, + // for now.. - Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); + Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); + Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); - Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); - Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + Value* temp_lo = BITCAST( + PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), + vGatherTy); + Value* temp_hi = BITCAST( + PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), + vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather - // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + // after pshufb mask for x channel; z uses the same shuffle from the second + // gather 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 Value* temp = JOIN_16(temp_lo, temp_hi); @@ -1708,44 +1847,47 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) } } -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args) { // Unpack tuple args - Value* (&vGatherResult)[2] = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); + Value*(&vGatherResult)[2] = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t& currentVertexElement = std::get<4>(args); + uint32_t& outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value*(&vVertexElements)[4] = std::get<8>(args); // cast types Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || - (extendType == Instruction::CastOps::FPExt)) + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || + (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) { // is this PP float? bool bFP = (extendType == Instruction::CastOps::FPExt) ? 
true : false; - Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), + mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); - Value* vi128XY = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); + // shuffle mask + Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vi128XY = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + Value* vShufResult = + BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy @@ -1753,23 +1895,25 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // do the same for zw components Value* vi128ZW = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + Value* vShufResult = + BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); } // init denormalize variables if needed Instruction::CastOps IntToFpCast; - Value* conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - IntToFpCast = Instruction::CastOps::SIToFP; + IntToFpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0 / 32767.0)); break; case CONVERT_SSCALED: - IntToFpCast = Instruction::CastOps::SIToFP; + IntToFpCast = Instruction::CastOps::SIToFP; conversionFactor = VIMMED1((float)(1.0)); break; case CONVERT_USCALED: @@ -1782,7 +1926,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) break; } - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + // sign extend all enabled components. If we have a fill vVertexElements, output to current + // simdvertex for (uint32_t i = 0; i < 4; i++) { if (isComponentEnabled(compMask, i)) @@ -1794,17 +1939,26 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - if (bFP) { + if (bFP) + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + vVertexElements[currentVertexElement] = + CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); } - else { + else + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + vVertexElements[currentVertexElement] = + PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); // denormalize if needed - if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + if (conversionType != CONVERT_NONE) + { + vVertexElements[currentVertexElement] = + FMUL(CAST(IntToFpCast, + vVertexElements[currentVertexElement], + mSimdFP32Ty), + conversionFactor); } } currentVertexElement++; @@ -1824,34 +1978,39 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } } // else zero extend - else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) + else if ((extendType == Instruction::CastOps::ZExt) || + (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component Value* vConstMask[2]; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) { + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) + { // x/z shuffle mask - vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + vConstMask[0] = C<char>({ + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + }); } - if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) { + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) + { // y/w shuffle mask - vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); } // init denormalize variables if needed Instruction::CastOps fpCast; - Value* conversionFactor; + Value* conversionFactor; switch (conversionType) { case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0 / 65535.0)); break; case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; + fpCast = Instruction::CastOps::UIToFP; conversionFactor = VIMMED1((float)(1.0f)); break; case CONVERT_SSCALED: @@ -1876,15 +2035,20 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 
0 : 1; - vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather - // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + vVertexElements[currentVertexElement] = + BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), + vConstMask[selectedMask]), + vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second + // gather 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 // denormalize if needed if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + vVertexElements[currentVertexElement] = + FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), + conversionFactor); } currentVertexElement++; } @@ -1914,7 +2078,10 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) /// @param outputElt - simdvertex offset in VIN to write to /// @param numEltsToStore - number of simdvertex rows to write out /// @param vVertexElements - LLVM Value*[] simdvertex to write out -void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +void FetchJit::StoreVertexElements(Value* pVtxOut, + const uint32_t outputElt, + const uint32_t numEltsToStore, + Value* (&vVertexElements)[4]) { SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); @@ -1924,14 +2091,14 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) { #if FETCH_DUMP_VERTEX - PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); + PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); #endif vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); } #if FETCH_DUMP_VERTEX else { - PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] }); + PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); } #endif // outputElt * 4 = offsetting by the size of a simdvertex @@ -1942,10 +2109,10 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con } ////////////////////////////////////////////////////////////////////////// -/// @brief Generates a constant vector of values based on the +/// @brief Generates a constant vector of values based on the /// ComponentControl value /// @param ctrl - ComponentControl value -Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) +Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) { switch (ctrl) { @@ -1961,21 +2128,23 @@ Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) { if (mVWidth == 16) { - Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8); - Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy); - Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy); + Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8); + Value* pIdLo = + BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy); + Value* pIdHi = + BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy); return JOIN_16(pIdLo, pIdHi); } else { - return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + return BITCAST(LOAD(GEP(mpFetchInfo, {0, 
SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty); } } case StoreInstanceId: - { - Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); - return VBROADCAST(pId); - } + { + Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty); + return VBROADCAST(pId); + } case StoreSrc: @@ -1994,15 +2163,20 @@ bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) switch (component) { // X - case 0: return (enableMask & ComponentEnable::X); + case 0: + return (enableMask & ComponentEnable::X); // Y - case 1: return (enableMask & ComponentEnable::Y); + case 1: + return (enableMask & ComponentEnable::Y); // Z - case 2: return (enableMask & ComponentEnable::Z); + case 2: + return (enableMask & ComponentEnable::Z); // W - case 3: return (enableMask & ComponentEnable::W); + case 3: + return (enableMask & ComponentEnable::W); - default: return false; + default: + return false; } } @@ -2018,21 +2192,22 @@ static std::mutex gFetchCodegenMutex; /// @return PFN_FETCH_FUNC - pointer to fetch code PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) { - const llvm::Function* func = (const llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_FETCH_FUNC pfnFetch; + const llvm::Function* func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_FETCH_FUNC pfnFetch; gFetchCodegenMutex.lock(); pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot + // add new IR to the module pJitMgr->mIsModuleFinalized = true; #if defined(KNOB_SWRC_TRACING) - char fName[1024]; - const char *funcName = func->getName().data(); + char fName[1024]; + const char* funcName = func->getName().data(); sprintf(fName, "%s.bin", funcName); - FILE *fd = fopen(fName, "wb"); - fwrite((void *)pfnFetch, 1, 2048, fd); + FILE* fd = fopen(fName, "wb"); + fwrite((void*)pfnFetch, 1, 2048, fd); fclose(fd); #endif @@ -2040,7 +2215,6 @@ PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) gFetchCodegenMutex.unlock(); - return pfnFetch; } @@ -2055,7 +2229,7 @@ extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_CO pJitMgr->SetupNewModule(); FetchJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(state); + HANDLE hFunc = theJit.Create(state); return JitFetchFunc(hJitMgr, hFunc); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h index de0ec4f8330..abc3091354f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file fetch_jit.h -* -* @brief Definition of the fetch jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file fetch_jit.h + * + * @brief Definition of the fetch jitter + * + * Notes: + * + ******************************************************************************/ #pragma once #include "common/formats.h" @@ -41,17 +41,17 @@ struct INPUT_ELEMENT_DESC { struct { - uint32_t AlignedByteOffset : 12; - uint32_t Format : 10; - uint32_t StreamIndex : 6; - uint32_t InstanceEnable : 1; - uint32_t InstanceStrideEnable : 1; - uint32_t ComponentControl0 : 4; - uint32_t ComponentControl1 : 4; - uint32_t ComponentControl2 : 4; - uint32_t ComponentControl3 : 4; - uint32_t ComponentPacking : 4; - uint32_t _reserved : 14; + uint32_t AlignedByteOffset : 12; + uint32_t Format : 10; + uint32_t StreamIndex : 6; + uint32_t InstanceEnable : 1; + uint32_t InstanceStrideEnable : 1; + uint32_t ComponentControl0 : 4; + uint32_t ComponentControl1 : 4; + uint32_t ComponentControl2 : 4; + uint32_t ComponentControl3 : 4; + uint32_t ComponentPacking : 4; + uint32_t _reserved : 14; }; uint64_t bits; }; @@ -95,40 +95,52 @@ enum ComponentControl ////////////////////////////////////////////////////////////////////////// struct FETCH_COMPILE_STATE { - uint32_t numAttribs{ 0 }; + uint32_t numAttribs{0}; INPUT_ELEMENT_DESC layout[SWR_VTX_NUM_SLOTS]; - SWR_FORMAT indexType; - uint32_t cutIndex{ 0xffffffff }; + SWR_FORMAT indexType; + uint32_t cutIndex{0xffffffff}; // Options that effect the JIT'd code - bool bDisableIndexOOBCheck; // If enabled, FetchJit will exclude index OOB check - bool bEnableCutIndex{ false }; // Compares indices with the cut index and returns a cut mask - bool bVertexIDOffsetEnable{ false }; // Offset vertexID by StartVertex for non-indexed draws or BaseVertex for indexed draws - bool bPartialVertexBuffer{ false }; // for indexed draws, map illegal indices to a known resident vertex + bool bDisableIndexOOBCheck; // If enabled, FetchJit will exclude index OOB check + bool bEnableCutIndex{false}; // Compares indices with the cut index and returns a cut mask + bool bVertexIDOffsetEnable{false}; // Offset vertexID by StartVertex for non-indexed draws or + // BaseVertex for indexed draws + bool bPartialVertexBuffer{ + false}; // for indexed draws, map illegal indices to a known resident vertex - bool bForceSequentialAccessEnable{ false }; - bool bInstanceIDOffsetEnable{ false }; + bool bForceSequentialAccessEnable{false}; + bool bInstanceIDOffsetEnable{false}; - FETCH_COMPILE_STATE(bool diableIndexOOBCheck = false): - bDisableIndexOOBCheck(diableIndexOOBCheck){ }; + FETCH_COMPILE_STATE(bool diableIndexOOBCheck = false) : + bDisableIndexOOBCheck(diableIndexOOBCheck){}; - bool operator==(const FETCH_COMPILE_STATE &other) const + bool operator==(const FETCH_COMPILE_STATE& other) const { - if (numAttribs != other.numAttribs) return false; - if (indexType != other.indexType) return false; - if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false; - if (bEnableCutIndex != other.bEnableCutIndex) return false; - if (cutIndex != other.cutIndex) return false; - if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) return false; - if (bPartialVertexBuffer != other.bPartialVertexBuffer) return false; - if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable) return false; - if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) return false; + if (numAttribs != other.numAttribs) + return false; + if (indexType != other.indexType) + return false; + if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) + return false; + if (bEnableCutIndex != other.bEnableCutIndex) + 
return false; + if (cutIndex != other.cutIndex) + return false; + if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) + return false; + if (bPartialVertexBuffer != other.bPartialVertexBuffer) + return false; + if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable) + return false; + if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) + return false; for (uint32_t i = 0; i < numAttribs; ++i) { if ((layout[i].bits != other.layout[i].bits) || - (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) && - (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))){ + (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) && + (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))) + { return false; } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp index f2bd8889fc5..2a01c706b96 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file lower_x86.cpp -* -* @brief llvm pass to lower meta code to x86 -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file lower_x86.cpp + * + * @brief llvm pass to lower meta code to x86 + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "passes.h" @@ -34,12 +34,11 @@ #include <unordered_map> - namespace llvm { // foward declare the initializer - void initializeLowerX86Pass(PassRegistry&); -} + void initializeLowerX86Pass(PassRegistry &); +} // namespace llvm namespace SwrJit { @@ -47,97 +46,135 @@ namespace SwrJit enum TargetArch { - AVX = 0, - AVX2 = 1, + AVX = 0, + AVX2 = 1, AVX512 = 2 }; enum TargetWidth { - W256 = 0, - W512 = 1, + W256 = 0, + W512 = 1, NUM_WIDTHS = 2 }; struct LowerX86; - typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc; + typedef std::function<Instruction *(LowerX86 *, TargetArch, TargetWidth, CallInst *)> EmuFunc; struct X86Intrinsic { Intrinsic::ID intrin[NUM_WIDTHS]; - EmuFunc emuFunc; + EmuFunc emuFunc; }; - // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of - // mapping directly to avx/avx2 intrinsics. + // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the + // previous behavior of mapping directly to avx/avx2 intrinsics. static std::map<std::string, Intrinsic::ID> intrinsicMap = { - {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32}, - {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b}, - {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256}, - {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256}, - {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256}, - {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256}, - {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d}, - {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32}, - {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}, + {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32}, + {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b}, + {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256}, + {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256}, + {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256}, + {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256}, + {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d}, + {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32}, + {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}, }; // Forward decls - Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - - Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin); - + Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst); + Instruction * + VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst); + Instruction 
* + VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst); + Instruction * + VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst); + Instruction * + VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst); + + Instruction *DOUBLE_EMU(LowerX86 * pThis, + TargetArch arch, + TargetWidth width, + CallInst * pCallInst, + Intrinsic::ID intrin); + static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1; static std::map<std::string, X86Intrinsic> intrinsicMap2[] = { // 256 wide 512 wide - { // AVX - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, - }, - { // AVX2 - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, - }, - { // AVX512 - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}}, - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512 }, NO_EMU}}, - {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}}, - {"meta.intrinsic.VHSUBPS", 
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}, - } - }; + { + // AVX + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", + {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", + {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, + }, + { + // AVX2 + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", + {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", + {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", + {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, + }, + { + // AVX512 + {"meta.intrinsic.VRCPPS", + {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", + {{Intrinsic::x86_avx512_mask_permvar_sf_256, + Intrinsic::x86_avx512_mask_permvar_sf_512}, + NO_EMU}}, + {"meta.intrinsic.VPERMD", + {{Intrinsic::x86_avx512_mask_permvar_si_256, + Intrinsic::x86_avx512_mask_permvar_si_512}, + NO_EMU}}, + {"meta.intrinsic.VGATHERPD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", + {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, + NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", + {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, + NO_EMU}}, + {"meta.intrinsic.VROUND", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}}, + {"meta.intrinsic.VHSUBPS", + {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}, + }}; struct LowerX86 : public FunctionPass { - LowerX86(Builder* b = nullptr) - : FunctionPass(ID), B(b) + LowerX86(Builder *b = nullptr) : FunctionPass(ID), B(b) { initializeLowerX86Pass(*PassRegistry::getPassRegistry()); @@ -153,7 +190,6 @@ namespace SwrJit else if (JM()->mArch.AVX()) { mTarget = AVX; - } else { @@ -166,9 +202,12 @@ namespace SwrJit // across all 
intrinsics, and will have to be rethought. Probably need something // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed // intrinsic. - void GetRequestedWidthAndType(CallInst* pCallInst, const StringRef intrinName, TargetWidth* pWidth, Type** pTy) + void GetRequestedWidthAndType(CallInst * pCallInst, + const StringRef intrinName, + TargetWidth * pWidth, + Type ** pTy) { - Type* pVecTy = pCallInst->getType(); + Type *pVecTy = pCallInst->getType(); // Check for intrinsic specific types // VCVTPD2PS type comes from src, not dst @@ -179,7 +218,7 @@ namespace SwrJit if (!pVecTy->isVectorTy()) { - for (auto& op : pCallInst->arg_operands()) + for (auto &op : pCallInst->arg_operands()) { if (op.get()->getType()->isVectorTy()) { @@ -193,53 +232,68 @@ namespace SwrJit uint32_t width = cast<VectorType>(pVecTy)->getBitWidth(); switch (width) { - case 256: *pWidth = W256; break; - case 512: *pWidth = W512; break; - default: SWR_ASSERT(false, "Unhandled vector width %d", width); + case 256: + *pWidth = W256; + break; + case 512: + *pWidth = W512; + break; + default: + SWR_ASSERT(false, "Unhandled vector width %d", width); *pWidth = W256; } *pTy = pVecTy->getScalarType(); } - Value* GetZeroVec(TargetWidth width, Type* pTy) + Value *GetZeroVec(TargetWidth width, Type *pTy) { uint32_t numElem = 0; switch (width) { - case W256: numElem = 8; break; - case W512: numElem = 16; break; - default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width); + case W256: + numElem = 8; + break; + case W512: + numElem = 16; + break; + default: + SWR_ASSERT(false, "Unhandled vector width type %d\n", width); } return ConstantVector::getNullValue(VectorType::get(pTy, numElem)); } - Value* GetMask(TargetWidth width) + Value *GetMask(TargetWidth width) { - Value* mask; + Value *mask; switch (width) { - case W256: mask = B->C((uint8_t)-1); break; - case W512: mask = B->C((uint16_t)-1); break; - default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width); + case W256: + mask = B->C((uint8_t)-1); + break; + case W512: + mask = B->C((uint16_t)-1); + break; + default: + SWR_ASSERT(false, "Unhandled vector width type %d\n", width); } return mask; } // Convert <N x i1> mask to <N x i32> x86 mask - Value* VectorMask(Value* vi1Mask) + Value *VectorMask(Value *vi1Mask) { uint32_t numElem = vi1Mask->getType()->getVectorNumElements(); return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem)); } - Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst) + Instruction *ProcessIntrinsicAdvanced(CallInst *pCallInst) { - Function* pFunc = pCallInst->getCalledFunction(); - auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()]; + Function * pFunc = pCallInst->getCalledFunction(); + auto & intrinsic = intrinsicMap2[mTarget][pFunc->getName()]; TargetWidth vecWidth; - Type* pElemTy; + Type * pElemTy; GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy); // Check if there is a native intrinsic for this instruction @@ -249,29 +303,33 @@ namespace SwrJit // Double pump the next smaller SIMD intrinsic SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width."); Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1]; - SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump."); + SWR_ASSERT(id2 != Intrinsic::not_intrinsic, + "Cannot find intrinsic to double pump."); return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2); } else if (id != Intrinsic::not_intrinsic) { - Function* pIntrin = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id); - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) + Function *pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id); + SmallVector<Value *, 8> args; + for (auto &arg : pCallInst->arg_operands()) { args.push_back(arg.get()); } - // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now - // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list. + // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and + // full mask for now Assuming the intrinsics are consistent and place the src + // operand and mask last in the argument list. if (mTarget == AVX512) { - if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS")) { + if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS")) + { args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType())); args.push_back(GetMask(W256)); // for AVX512 VCVTPD2PS, we also have to add rounding mode - args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | - _MM_FROUND_NO_EXC)); - } else { + args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + } + else + { args.push_back(GetZeroVec(vecWidth, pElemTy)); args.push_back(GetMask(vecWidth)); } @@ -289,23 +347,26 @@ namespace SwrJit return nullptr; } - Instruction* ProcessIntrinsic(CallInst* pCallInst) + Instruction *ProcessIntrinsic(CallInst *pCallInst) { - Function* pFunc = pCallInst->getCalledFunction(); - + Function *pFunc = pCallInst->getCalledFunction(); + // Forward to the advanced support if found if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end()) { return ProcessIntrinsicAdvanced(pCallInst); } - SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName()); + SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), + "Unimplemented intrinsic %s.", + pFunc->getName()); Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()]; - Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic); + Function * pX86IntrinFunc = + Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic); - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) + SmallVector<Value *, 8> args; + for (auto &arg : pCallInst->arg_operands()) { args.push_back(arg.get()); } @@ -315,34 +376,33 @@ namespace SwrJit ////////////////////////////////////////////////////////////////////////// /// @brief LLVM funtion pass run method. /// @param f- The function we're working on with this pass. 
- virtual bool runOnFunction(Function& F) + virtual bool runOnFunction(Function &F) { - std::vector<Instruction*> toRemove; + std::vector<Instruction *> toRemove; - for (auto& BB : F.getBasicBlockList()) + for (auto &BB : F.getBasicBlockList()) { - for (auto& I : BB.getInstList()) + for (auto &I : BB.getInstList()) { - if (CallInst* pCallInst = dyn_cast<CallInst>(&I)) + if (CallInst *pCallInst = dyn_cast<CallInst>(&I)) { - Function* pFunc = pCallInst->getCalledFunction(); + Function *pFunc = pCallInst->getCalledFunction(); if (pFunc) { if (pFunc->getName().startswith("meta.intrinsic")) { B->IRB()->SetInsertPoint(&I); - Instruction* pReplace = ProcessIntrinsic(pCallInst); + Instruction *pReplace = ProcessIntrinsic(pCallInst); SWR_ASSERT(pReplace); toRemove.push_back(pCallInst); pCallInst->replaceAllUsesWith(pReplace); } } - } } } - for (auto* pInst : toRemove) + for (auto *pInst : toRemove) { pInst->eraseFromParent(); } @@ -352,42 +412,37 @@ namespace SwrJit return true; } - virtual void getAnalysisUsage(AnalysisUsage& AU) const - { - } + virtual void getAnalysisUsage(AnalysisUsage &AU) const {} - JitManager* JM() { return B->JM(); } + JitManager *JM() { return B->JM(); } - Builder* B; + Builder *B; TargetArch mTarget; - static char ID; ///< Needed by LLVM to generate ID for FunctionPass. + static char ID; ///< Needed by LLVM to generate ID for FunctionPass. }; - char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID. + char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID. - FunctionPass* createLowerX86Pass(Builder* b) - { - return new LowerX86(b); - } + FunctionPass *createLowerX86Pass(Builder *b) { return new LowerX86(b); } - Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst) { SWR_ASSERT(false, "Unimplemented intrinsic emulation."); return nullptr; } - Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + Instruction *VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst) { // Only need vperm emulation for AVX SWR_ASSERT(arch == AVX); - Builder* B = pThis->B; - auto v32A = pCallInst->getArgOperand(0); - auto vi32Index = pCallInst->getArgOperand(1); + Builder *B = pThis->B; + auto v32A = pCallInst->getArgOperand(0); + auto vi32Index = pCallInst->getArgOperand(1); - Value* v32Result; + Value *v32Result; if (isa<Constant>(vi32Index)) { // Can use llvm shuffle vector directly with constant shuffle indices @@ -399,67 +454,71 @@ namespace SwrJit for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l) { auto i32Index = B->VEXTRACT(vi32Index, B->C(l)); - auto val = B->VEXTRACT(v32A, i32Index); - v32Result = B->VINSERT(v32Result, val, B->C(l)); + auto val = B->VEXTRACT(v32A, i32Index); + v32Result = B->VINSERT(v32Result, val, B->C(l)); } } return cast<Instruction>(v32Result); } - Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + Instruction * + VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst) { - Builder* B = pThis->B; - auto vSrc = pCallInst->getArgOperand(0); - auto pBase = pCallInst->getArgOperand(1); - auto vi32Indices = pCallInst->getArgOperand(2); - auto vi1Mask = pCallInst->getArgOperand(3); - auto i8Scale = pCallInst->getArgOperand(4); - - pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0)); - uint32_t numElem = 
vSrc->getType()->getVectorNumElements(); - auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); - auto srcTy = vSrc->getType()->getVectorElementType(); - Value* v32Gather; + Builder *B = pThis->B; + auto vSrc = pCallInst->getArgOperand(0); + auto pBase = pCallInst->getArgOperand(1); + auto vi32Indices = pCallInst->getArgOperand(2); + auto vi1Mask = pCallInst->getArgOperand(3); + auto i8Scale = pCallInst->getArgOperand(4); + + pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0)); + uint32_t numElem = vSrc->getType()->getVectorNumElements(); + auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); + auto srcTy = vSrc->getType()->getVectorElementType(); + Value * v32Gather; if (arch == AVX) { // Full emulation for AVX // Store source on stack to provide a valid address to load from inactive lanes auto pStack = B->STACKSAVE(); - auto pTmp = B->ALLOCA(vSrc->getType()); + auto pTmp = B->ALLOCA(vSrc->getType()); B->STORE(vSrc, pTmp); - v32Gather = UndefValue::get(vSrc->getType()); - auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale)); + v32Gather = UndefValue::get(vSrc->getType()); + auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale)); auto vi32Offsets = B->MUL(vi32Indices, vi32Scale); for (uint32_t i = 0; i < numElem; ++i) { - auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i)); - auto pLoadAddress = B->GEP(pBase, i32Offset); - pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0)); - auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i }); - auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i)); - auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress); - auto val = B->LOAD(pValidAddress); - v32Gather = B->VINSERT(v32Gather, val, B->C(i)); + auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i)); + auto pLoadAddress = B->GEP(pBase, i32Offset); + pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0)); + auto pMaskedLoadAddress = B->GEP(pTmp, {0, i}); + auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i)); + auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress); + auto val = B->LOAD(pValidAddress); + v32Gather = B->VINSERT(v32Gather, val, B->C(i)); } B->STACKRESTORE(pStack); } else if (arch == AVX2 || (arch == AVX512 && width == W256)) { - Function* pX86IntrinFunc; + Function *pX86IntrinFunc; if (srcTy == B->mFP32Ty) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256); - } + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx2_gather_d_ps_256); + } else if (srcTy == B->mInt32Ty) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256); + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx2_gather_d_d_256); } else if (srcTy == B->mDoubleTy) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_q_256); + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx2_gather_d_q_256); } else { @@ -469,7 +528,7 @@ namespace SwrJit if (width == W256) { auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType()); - v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale }); + v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale}); } else if (width == W512) { @@ -477,45 +536,58 @@ namespace SwrJit if (vSrc->getType()->getVectorElementType() == B->mDoubleTy) { auto v64Mask = pThis->VectorMask(vi1Mask); 
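// ----------------------------------------------------------------------------
// Editor's sketch (not part of this patch; the helper name is hypothetical):
// the scalar idea behind the AVX "full emulation" gather in VGATHER_EMU above.
// The source vector is spilled to the stack so that inactive lanes still
// dereference a valid address and simply reload their own pass-through value,
// matching the per-lane SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress).
// ----------------------------------------------------------------------------
#include <cstdint>
#include <cstring>

template <typename T, int N>
void emulated_masked_gather(T (&dst)[N], const T (&src)[N],
                            const uint8_t* pBase, const int32_t (&indices)[N],
                            const bool (&mask)[N], int32_t scale)
{
    T tmp[N]; // stack copy of the source: a valid fallback address per lane
    std::memcpy(tmp, src, sizeof(tmp));
    for (int i = 0; i < N; ++i)
    {
        // active lane: load from base + index * scale;
        // inactive lane: "load" its own source value back from the stack copy
        const T* pLoad = mask[i]
            ? reinterpret_cast<const T*>(pBase + indices[i] * scale)
            : &tmp[i];
        dst[i] = *pLoad;
    }
}
// ----------------------------------------------------------------------------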
- v64Mask = B->S_EXT(v64Mask, - VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements())); + v64Mask = B->S_EXT( + v64Mask, + VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements())); v64Mask = B->BITCAST(v64Mask, vSrc->getType()); - Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 })); - Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({ 4, 5, 6, 7 })); - - Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 0, 1, 2, 3 })); - Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 4, 5, 6, 7 })); - - Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 2, 3 })); - Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 6, 7 })); - - src0 = B->BITCAST(src0, VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements())); - mask0 = B->BITCAST(mask0, VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements())); - Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale }); - src1 = B->BITCAST(src1, VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements())); - mask1 = B->BITCAST(mask1, VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements())); - Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale }); - - v32Gather = B->VSHUFFLE(gather0, gather1, B->C({ 0, 1, 2, 3, 4, 5, 6, 7 })); + Value *src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3})); + Value *src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7})); + + Value *indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3})); + Value *indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7})); + + Value *mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3})); + Value *mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7})); + + src0 = B->BITCAST( + src0, + VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements())); + mask0 = B->BITCAST( + mask0, + VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements())); + Value *gather0 = + B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale}); + src1 = B->BITCAST( + src1, + VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements())); + mask1 = B->BITCAST( + mask1, + VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements())); + Value *gather1 = + B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale}); + + v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7})); v32Gather = B->BITCAST(v32Gather, vSrc->getType()); } else { // Double pump 8-wide for 32bit elements auto v32Mask = pThis->VectorMask(vi1Mask); - v32Mask = B->BITCAST(v32Mask, vSrc->getType()); - Value* src0 = B->EXTRACT_16(vSrc, 0); - Value* src1 = B->EXTRACT_16(vSrc, 1); + v32Mask = B->BITCAST(v32Mask, vSrc->getType()); + Value *src0 = B->EXTRACT_16(vSrc, 0); + Value *src1 = B->EXTRACT_16(vSrc, 1); - Value* indices0 = B->EXTRACT_16(vi32Indices, 0); - Value* indices1 = B->EXTRACT_16(vi32Indices, 1); + Value *indices0 = B->EXTRACT_16(vi32Indices, 0); + Value *indices1 = B->EXTRACT_16(vi32Indices, 1); - Value* mask0 = B->EXTRACT_16(v32Mask, 0); - Value* mask1 = B->EXTRACT_16(v32Mask, 1); + Value *mask0 = B->EXTRACT_16(v32Mask, 0); + Value *mask1 = B->EXTRACT_16(v32Mask, 1); - Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale }); - Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale }); + Value *gather0 = + B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale}); + Value *gather1 = + B->CALL(pX86IntrinFunc, {src1, pBase, 
indices1, mask1, i8Scale}); v32Gather = B->JOIN_16(gather0, gather1); } @@ -523,22 +595,25 @@ namespace SwrJit } else if (arch == AVX512) { - Value* iMask; - Function* pX86IntrinFunc; + Value * iMask; + Function *pX86IntrinFunc; if (srcTy == B->mFP32Ty) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512); - iMask = B->BITCAST(vi1Mask, B->mInt16Ty); + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx512_gather_dps_512); + iMask = B->BITCAST(vi1Mask, B->mInt16Ty); } else if (srcTy == B->mInt32Ty) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512); - iMask = B->BITCAST(vi1Mask, B->mInt16Ty); + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx512_gather_dpi_512); + iMask = B->BITCAST(vi1Mask, B->mInt16Ty); } else if (srcTy == B->mDoubleTy) { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpd_512); - iMask = B->BITCAST(vi1Mask, B->mInt8Ty); + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx512_gather_dpd_512); + iMask = B->BITCAST(vi1Mask, B->mInt8Ty); } else { @@ -546,21 +621,24 @@ namespace SwrJit } auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); - v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, iMask, i32Scale }); + v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale}); } return cast<Instruction>(v32Gather); } - // No support for vroundps in avx512 (it is available in kncni), so emulate with avx instructions - Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + // No support for vroundps in avx512 (it is available in kncni), so emulate with avx + // instructions + Instruction * + VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst) { SWR_ASSERT(arch == AVX512); - auto B = pThis->B; + auto B = pThis->B; auto vf32Src = pCallInst->getOperand(0); auto i8Round = pCallInst->getOperand(1); - auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); + auto pfnFunc = + Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); if (width == W256) { @@ -585,25 +663,26 @@ namespace SwrJit } // No support for hsub in AVX512 - Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + Instruction *VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst) { SWR_ASSERT(arch == AVX512); - auto B = pThis->B; + auto B = pThis->B; auto src0 = pCallInst->getOperand(0); auto src1 = pCallInst->getOperand(1); // 256b hsub can just use avx intrinsic if (width == W256) { - auto pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); + auto pX86IntrinFunc = + Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1)); } else if (width == W512) { // 512b hsub can be accomplished with shuf/sub combo - auto minuend = B->VSHUFFLE(src0, src1, B->C({ 0, 2, 8, 10, 4, 6, 12, 14 })); - auto subtrahend = B->VSHUFFLE(src0, src1, B->C({ 1, 3, 9, 11, 5, 7, 13, 15 })); + auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14})); + auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15})); return 
cast<Instruction>(B->SUB(minuend, subtrahend)); } else @@ -613,25 +692,30 @@ namespace SwrJit } } - // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from each vector argument and - // calls the 256 wide intrinsic, then merges the results to 512 wide - Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin) + // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from + // each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide + Instruction *DOUBLE_EMU(LowerX86 * pThis, + TargetArch arch, + TargetWidth width, + CallInst * pCallInst, + Intrinsic::ID intrin) { auto B = pThis->B; SWR_ASSERT(width == W512); - Value* result[2]; - Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin); + Value * result[2]; + Function *pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin); for (uint32_t i = 0; i < 2; ++i) { - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) + SmallVector<Value *, 8> args; + for (auto &arg : pCallInst->arg_operands()) { auto argType = arg.get()->getType(); if (argType->isVectorTy()) { - uint32_t vecWidth = argType->getVectorNumElements(); - Value *lanes = B->CInc<int>(i*vecWidth/2, vecWidth/2); - Value *argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes); + uint32_t vecWidth = argType->getVectorNumElements(); + Value * lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2); + Value * argToPush = B->VSHUFFLE( + arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes); args.push_back(argToPush); } else @@ -646,7 +730,7 @@ namespace SwrJit { assert(result[1]->getType()->isVectorTy()); vecWidth = result[0]->getType()->getVectorNumElements() + - result[1]->getType()->getVectorNumElements(); + result[1]->getType()->getVectorNumElements(); } else { @@ -656,10 +740,9 @@ namespace SwrJit return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes)); } -} +} // namespace SwrJit using namespace SwrJit; INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false) INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false) - diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h index 95ef4bcf016..d3c732af042 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h @@ -1,30 +1,30 @@ /**************************************************************************** -* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file passes.h -* -* @brief Include file for llvm passes -* -******************************************************************************/ + * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file passes.h + * + * @brief Include file for llvm passes + * + ******************************************************************************/ #include "JitManager.h" #include "builder.h" @@ -34,4 +34,4 @@ namespace SwrJit using namespace llvm; FunctionPass* createLowerX86Pass(Builder* b); -} +} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h index fb6cf9b3f0a..cc986a78e0a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
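passes.h exposes the LowerX86 pass through the usual LLVM factory-function idiom. A hedged sketch of how a legacy-pass-manager pipeline would consume it — the wrapper function and its parameters are illustrative, not from the jitter:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "passes.h" // declares SwrJit::createLowerX86Pass

    // Run the LowerX86 lowering over every function in a module using
    // the legacy FunctionPassManager the jitter already builds on.
    void RunLowerX86(llvm::Module& mod, SwrJit::Builder* pBuilder)
    {
        llvm::legacy::FunctionPassManager fpm(&mod);
        fpm.add(SwrJit::createLowerX86Pass(pBuilder)); // fpm takes ownership
        fpm.doInitialization();
        for (llvm::Function& fn : mod)
        {
            fpm.run(fn);
        }
        fpm.doFinalization();
    }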
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file jit_api.h -* -* @brief Platform independent JIT interface -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file jit_api.h + * + * @brief Platform independent JIT interface + * + * Notes: + * + ******************************************************************************/ #pragma once #include "common/os.h" #include "core/utils.h" @@ -48,7 +48,6 @@ #endif - struct ShaderInfo; ////////////////////////////////////////////////////////////////////////// @@ -59,15 +58,15 @@ struct JIT_COMPILE_INPUT SWR_SHADER_TYPE type; uint32_t crc; - const void* pIR; ///< Pointer to LLVM IR text. - size_t irLength; + const void* pIR; ///< Pointer to LLVM IR text. + size_t irLength; bool enableJitSampler; }; -extern "C" -{ + +extern "C" { ////////////////////////////////////////////////////////////////////////// /// @brief Create JIT context. @@ -82,17 +81,13 @@ void JITCALL JitDestroyContext(HANDLE hJitContext); /// @param hJitContext - Jit Context /// @param input - Input containing LLVM IR and other information /// @param output - Output containing information about JIT shader -ShaderInfo* JITCALL JitCompileShader( - HANDLE hJitContext, - const JIT_COMPILE_INPUT& input); +ShaderInfo* JITCALL JitCompileShader(HANDLE hJitContext, const JIT_COMPILE_INPUT& input); ////////////////////////////////////////////////////////////////////////// /// @brief JIT destroy shader. /// @param hJitContext - Jit Context /// @param pShaderInfo - pointer to shader object. 
-void JITCALL JitDestroyShader( - HANDLE hJitContext, - ShaderInfo*& pShaderInfo); +void JITCALL JitDestroyShader(HANDLE hJitContext, ShaderInfo*& pShaderInfo); ////////////////////////////////////////////////////////////////////////// /// @brief JIT compiles fetch shader diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp index 001a1ab241f..47f717bfc2a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file jit_pch.hpp -* -* @brief Pre-compiled header for jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
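Taken together, the jit_api.h entry points define a simple compile lifecycle. The sketch below strings them together; JitCreateContext's parameter list is not visible in this diff, so its call is deliberately left incomplete rather than guessed, and the wrapper function itself is hypothetical:

    #include "jit_api.h"

    ShaderInfo* CompileFromIR(SWR_SHADER_TYPE type, const void* pIRText, size_t irLength, uint32_t crc)
    {
        HANDLE hJitContext = JitCreateContext(/* args elided; see jit_api.h */);

        JIT_COMPILE_INPUT input = {};
        input.type             = type;
        input.crc              = crc;      // identifies this compile
        input.pIR              = pIRText;  // LLVM IR text, per the header
        input.irLength         = irLength;
        input.enableJitSampler = false;

        ShaderInfo* pShaderInfo = JitCompileShader(hJitContext, input);

        // ...use the compiled shader; when retiring it:
        // JitDestroyShader(hJitContext, pShaderInfo);
        // JitDestroyContext(hJitContext);
        return pShaderInfo;
    }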
+ * + * @file jit_pch.hpp + * + * @brief Pre-compiled header for jitter + * + * Notes: + * + ******************************************************************************/ #pragma once @@ -58,7 +58,7 @@ #include "llvm/IR/LegacyPassManager.h" using FunctionPassManager = llvm::legacy::FunctionPassManager; -using PassManager = llvm::legacy::PassManager; +using PassManager = llvm::legacy::PassManager; #include "llvm/CodeGen/Passes.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" @@ -92,7 +92,6 @@ using PassManager = llvm::legacy::PassManager; #include "llvm/Transforms/Utils/Cloning.h" - #if defined(_WIN32) #include "llvm/ADT/Triple.h" #endif @@ -117,16 +116,18 @@ using PassManager = llvm::legacy::PassManager; #endif #if LLVM_VERSION_MAJOR >= 5 -static const auto Sync_CrossThread = llvm::SyncScope::System; -static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex; -static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b) +static const auto Sync_CrossThread = llvm::SyncScope::System; +static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex; +static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, + const llvm::AttrBuilder& b) { return llvm::AttributeSet::get(ctx, b); } #else -static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread; -static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex; -static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b) +static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread; +static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex; +static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, + const llvm::AttrBuilder& b) { return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b); } @@ -134,7 +135,6 @@ static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const #pragma pop_macro("DEBUG") - #include <deque> #include <list> #include <unordered_map> diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp index 54d45e6bc4c..1c9db0c2d2a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp @@ -1,36 +1,35 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
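The Sync_CrossThread / Attrib_FunctionIndex / GetFuncAttribSet aliases in jit_pch.hpp exist only to paper over LLVM API churn between major versions 4 and 5. A sketch of a version-independent consumer — this helper is illustrative, not from the jitter, and how the resulting set is attached to a Function also varies by LLVM version, so that step is left out:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    // Build a function-attribute set without caring whether we are on
    // LLVM 4 (AttributeSet::FunctionIndex) or LLVM 5+ (AttributeList).
    static llvm::AttributeSet NoUnwindAttrs(llvm::LLVMContext& ctx)
    {
        llvm::AttrBuilder b;
        b.addAttribute(llvm::Attribute::NoUnwind);
        return GetFuncAttribSet(ctx, b); // shim hides the 4-vs-5 split
    }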
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file DebugOutput.cpp -* -* @brief Shader support library implementation for printed Debug output -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file DebugOutput.cpp + * + * @brief Shader support library implementation for printed Debug output + * + * Notes: + * + ******************************************************************************/ #include <stdarg.h> #include "common/os.h" - ////////////////////////////////////////////////////////////////////////// /// @brief called in JIT code, inserted by PRINT /// output to both stdout and visual studio debug console @@ -40,7 +39,7 @@ extern "C" void CallPrint(const char* fmt, ...) va_start(args, fmt); vprintf(fmt, args); -#if defined( _WIN32 ) +#if defined(_WIN32) char strBuf[1024]; vsnprintf_s(strBuf, _TRUNCATE, fmt, args); OutputDebugStringA(strBuf); @@ -48,4 +47,3 @@ extern "C" void CallPrint(const char* fmt, ...) va_end(args); } - diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index cb2e3aed61a..8f86af2a4b4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
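CallPrint above hands the same va_list first to vprintf and then, on Windows, to vsnprintf_s. The C standard leaves a va_list indeterminate after a v*printf call, so the strictly portable form of this double-consume pattern goes through va_copy; a minimal sketch, not the SWR implementation:

    #include <cstdarg>
    #include <cstdio>

    extern "C" void CallPrintPortable(const char* fmt, ...)
    {
        va_list args;
        va_start(args, fmt);

        va_list argsCopy;
        va_copy(argsCopy, args); // each consumer gets its own list

        vprintf(fmt, args); // first consumer: stdout

        char strBuf[1024];
        vsnprintf(strBuf, sizeof(strBuf), fmt, argsCopy); // second consumer
        // strBuf could now be forwarded to OutputDebugStringA() on Windows.

        va_end(argsCopy);
        va_end(args);
    }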
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file streamout_jit.cpp -* -* @brief Implementation of the streamout jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file streamout_jit.cpp + * + * @brief Implementation of the streamout jitter + * + * Notes: + * + ******************************************************************************/ #include "jit_pch.hpp" #include "builder.h" #include "jit_api.h" @@ -44,13 +44,12 @@ struct StreamOutJit : public Builder { StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){}; - // returns pointer to SWR_STREAMOUT_BUFFER + // returns pointer to SWR_STREAMOUT_BUFFER Value* getSOBuffer(Value* pSoCtx, uint32_t buffer) { - return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer }); + return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer}); } - ////////////////////////////////////////////////////////////////////////// // @brief checks if streamout buffer is oob // @return <i1> true/false @@ -62,28 +61,27 @@ struct StreamOutJit : public Builder // load enable // @todo bool data types should generate <i1> llvm type - Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty()); + Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty()); // load buffer size - Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize }); - + Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize}); + // load current streamOffset - Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); // load buffer pitch - Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); + Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch}); // buffer is considered oob if in use in a decl but not enabled returnMask = OR(returnMask, NOT(enabled)); // buffer is oob if cannot fit a prims worth of verts Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim))); - returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); + returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); return returnMask; } - ////////////////////////////////////////////////////////////////////////// // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, // packing the active mask bits @@ -93,8 +91,8 @@ struct StreamOutJit : public Builder Value* PackMask(uint32_t bitmask) { std::vector<Constant*> indices(4, C(0)); - DWORD index; - uint32_t elem = 0; + DWORD index; + uint32_t elem = 0; while (_BitScanForward(&index, bitmask)) { indices[elem++] = C((int)index); @@ -133,17 +131,17 @@ struct StreamOutJit : public Builder void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) { uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); - uint32_t packedMask = (1 << numComponents) - 1; + uint32_t packedMask = (1 << numComponents) - 1; if (!decl.hole) { // increment stream pointer to correct slot Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); // load 4 components from stream - Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); + Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); Type* simd4PtrTy = PointerType::get(simd4Ty, 0); - pAttrib = BITCAST(pAttrib, simd4PtrTy); - Value *vattrib = LOAD(pAttrib); + pAttrib = BITCAST(pAttrib, simd4PtrTy); + Value* vattrib = LOAD(pAttrib); // shuffle/pack enabled components Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); @@ -178,7 +176,11 @@ struct StreamOutJit : public Builder } } - void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* 
soFunc) + void buildStream(const STREAMOUT_COMPILE_STATE& state, + const STREAMOUT_STREAM& streamState, + Value* pSoCtx, + BasicBlock* returnBB, + Function* soFunc) { // get list of active SO buffers std::unordered_set<uint32_t> activeSOBuffers; @@ -189,9 +191,9 @@ struct StreamOutJit : public Builder } // always increment numPrimStorageNeeded - Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); - numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); - STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); + Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded}); + numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); + STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded}); // check OOB on active SO buffers. If any buffer is out of bound, don't write // the primitive to any buffer @@ -208,27 +210,27 @@ struct StreamOutJit : public Builder IRB()->SetInsertPoint(validBB); - Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); - numPrimsWritten = ADD(numPrimsWritten, C(1)); - STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); + Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten}); + numPrimsWritten = ADD(numPrimsWritten, C(1)); + STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten}); // compute start pointer for each output buffer Value* pOutBuffer[4]; Value* pOutBufferStartVertex[4]; Value* outBufferPitch[4]; - for (uint32_t b: activeSOBuffers) + for (uint32_t b : activeSOBuffers) { - Value* pBuf = getSOBuffer(pSoCtx, b); - Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); - Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); - pOutBuffer[b] = GEP(pData, streamOffset); + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* pData = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer}); + Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); + pOutBuffer[b] = GEP(pData, streamOffset); pOutBufferStartVertex[b] = pOutBuffer[b]; - outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); + outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch}); } // loop over the vertices of the prim - Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); + Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData}); for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) { buildVertex(streamState, pStreamData, pOutBuffer); @@ -241,23 +243,24 @@ struct StreamOutJit : public Builder for (uint32_t b : activeSOBuffers) { pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); - pOutBuffer[b] = pOutBufferStartVertex[b]; + pOutBuffer[b] = pOutBufferStartVertex[b]; } } // update each active buffer's streamOffset for (uint32_t b : activeSOBuffers) { - Value* pBuf = getSOBuffer(pSoCtx, b); - Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); - STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); } } Function* Create(const STREAMOUT_COMPILE_STATE& state) { - std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + 
std::stringstream fnName("SO_", + std::ios_base::in | std::ios_base::out | std::ios_base::ate); fnName << ComputeCRC(0, &state, sizeof(state)); // SO function signature @@ -267,19 +270,20 @@ struct StreamOutJit : public Builder PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* }; - FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); - Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* soFunc = Function::Create( + fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); soFunc->getParent()->setModuleIdentifier(soFunc->getName()); // create return basic block - BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); IRB()->SetInsertPoint(entry); // arguments - auto argitr = soFunc->arg_begin(); + auto argitr = soFunc->arg_begin(); Value* pSoCtx = &*argitr++; pSoCtx->setName("pSoCtx"); @@ -325,11 +329,12 @@ struct StreamOutJit : public Builder /// @return PFN_SO_FUNC - pointer to SOS function PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) { - llvm::Function *func = (llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_SO_FUNC pfnStreamOut; + llvm::Function* func = (llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_SO_FUNC pfnStreamOut; pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot + // add new IR to the module pJitMgr->mIsModuleFinalized = true; pJitMgr->DumpAsm(func, "SoFunc_optimized"); @@ -342,7 +347,8 @@ PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) /// @brief JIT compiles streamout shader /// @param hJitMgr - JitManager handle /// @param state - SO state to build function from -extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) +extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, + const STREAMOUT_COMPILE_STATE& state) { JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); @@ -358,7 +364,7 @@ extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMO pJitMgr->SetupNewModule(); StreamOutJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(soState); + HANDLE hFunc = theJit.Create(soState); return JitStreamoutFunc(hJitMgr, hFunc); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h index 097f8ab44d9..cee7b5748ed 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h @@ -1,32 +1,32 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
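The comment in JitStreamoutFunc above states the key MCJIT constraint: the first getFunctionAddress call finalizes a module, and a finalized module rejects new IR, which is why SetupNewModule precedes every compile. A hedged sketch of the resulting one-module-per-shader discipline — function and parameter names are illustrative:

    #include "llvm/ExecutionEngine/ExecutionEngine.h"
    #include "llvm/IR/Module.h"
    #include <memory>
    #include <string>

    // Emit code for one function, accepting that its module freezes.
    uint64_t CompileInFreshModule(llvm::ExecutionEngine& ee,
                                  std::unique_ptr<llvm::Module> mod,
                                  const std::string& fnName)
    {
        ee.addModule(std::move(mod)); // new IR must arrive in a new module
        uint64_t addr = ee.getFunctionAddress(fnName); // finalizes the module
        // The module defining fnName is now immutable; the next shader
        // starts from another freshly created module.
        return addr;
    }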
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file streamout_jit.h -* -* @brief Definition of the streamout jitter -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file streamout_jit.h + * + * @brief Definition of the streamout jitter + * + * Notes: + * + ******************************************************************************/ #pragma once #include "common/formats.h" @@ -43,7 +43,7 @@ struct STREAMOUT_DECL // attribute to stream uint32_t attribSlot; - // attribute component mask + // attribute component mask uint32_t componentMask; // indicates this decl is a hole @@ -69,24 +69,31 @@ struct STREAMOUT_COMPILE_STATE { // number of verts per primitive uint32_t numVertsPerPrim; - uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values. + uint32_t + offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values. 
uint64_t streamMask; // stream decls STREAMOUT_STREAM stream; - bool operator==(const STREAMOUT_COMPILE_STATE &other) const + bool operator==(const STREAMOUT_COMPILE_STATE& other) const { - if (numVertsPerPrim != other.numVertsPerPrim) return false; - if (stream.numDecls != other.stream.numDecls) return false; + if (numVertsPerPrim != other.numVertsPerPrim) + return false; + if (stream.numDecls != other.stream.numDecls) + return false; for (uint32_t i = 0; i < stream.numDecls; ++i) { - if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false; - if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false; - if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false; - if (stream.decl[i].hole != other.stream.decl[i].hole) return false; + if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) + return false; + if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) + return false; + if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) + return false; + if (stream.decl[i].hole != other.stream.decl[i].hole) + return false; } return true;
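For reference, PackMask in streamout_jit.cpp above compacts the set bits of a component mask into the front of a four-entry shuffle-index vector. The same computation in plain scalar C++, as a sketch independent of the LLVM builder — for mask 0b1101 it yields {0, 2, 3, 0}:

    #include <array>
    #include <cstdint>

    // Scalar model of PackMask: gather the indices of set bits, lowest
    // first; unused slots keep the default index 0.
    static std::array<int, 4> PackMaskScalar(uint32_t bitmask)
    {
        std::array<int, 4> indices = {{0, 0, 0, 0}};
        int elem = 0;
        for (int bit = 0; bit < 4 && elem < 4; ++bit)
        {
            if (bitmask & (1u << bit))
            {
                indices[elem++] = bit;
            }
        }
        return indices;
    }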