summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2017-06-26 13:00:27 -0500
committerTim Rowley <[email protected]>2017-06-30 13:26:19 -0500
commitcae53b24d7a739647193711e9a16c7face7ec72a (patch)
tree2d70ab5c42624d507b10816187b116c1fc3470a0
parentb89bd3694c12f95a74af02e8095edcd631a05801 (diff)
swr/rast: Split backend.cpp to improve compile time
Hardcode split to four files currently. Decreases swr build time on a quad-core by ~10%. Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
-rw-r--r--src/gallium/drivers/swr/Makefile.am29
-rw-r--r--src/gallium/drivers/swr/Makefile.sources4
-rw-r--r--src/gallium/drivers/swr/SConscript19
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py19
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp1
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp43
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp7
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.cpp809
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.h1033
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp281
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend_impl.h1067
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp345
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp321
13 files changed, 2146 insertions, 1832 deletions
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 6650abda5ae..578f15909b6 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -34,6 +34,7 @@ COMMON_CXXFLAGS = \
$(LLVM_CXXFLAGS) \
$(SWR_CXX11_CXXFLAGS) \
-I$(builddir)/rasterizer/codegen \
+ -I$(builddir)/rasterizer/core \
-I$(builddir)/rasterizer/jitter \
-I$(builddir)/rasterizer/archrast \
-I$(srcdir)/rasterizer \
@@ -62,7 +63,11 @@ BUILT_SOURCES = \
rasterizer/archrast/gen_ar_event.cpp \
rasterizer/archrast/gen_ar_eventhandler.hpp \
rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
- rasterizer/core/gen_BackendPixelRate0.cpp
+ rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+ rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+ rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+ rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+ rasterizer/core/backends/gen_BackendPixelRate.hpp
MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -140,20 +145,33 @@ rasterizer/archrast/gen_ar_eventhandlerfile.hpp: rasterizer/codegen/gen_archrast
--output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
--gen_eventhandlerfile_h
+rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+rasterizer/core/backends/gen_BackendPixelRate.hpp: \
+backend.intermediate
+
# 5 SWR_MULTISAMPLE_TYPE_COUNT
# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
# 3 SWR_INPUT_COVERAGE_COUNT
# 2 centroid
# 2 forcedSampleCount
# 2 canEarlyZ
-rasterizer/core/gen_BackendPixelRate0.cpp: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp
+
+# use intermediate rule to tell make that all files can be
+# generated in one invocation of gen_backends.py (prevents
+# parallel make race condition)
+.INTERMEDIATE: backend.intermediate
+backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp rasterizer/codegen/templates/gen_header_init.hpp
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_backends.py \
- --outdir rasterizer/core \
+ --outdir rasterizer/core/backends \
--dim 5 2 3 2 2 2 \
- --split 0 \
- --cpp
+ --numfiles 4 \
+ --cpp \
+ --hpp
COMMON_LIBADD = \
$(top_builddir)/src/gallium/auxiliary/libgallium.la \
@@ -227,5 +245,6 @@ EXTRA_DIST = \
rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp \
rasterizer/codegen/templates/gen_backend.cpp \
rasterizer/codegen/templates/gen_builder.hpp \
+ rasterizer/codegen/templates/gen_header_init.hpp \
rasterizer/codegen/templates/gen_knobs.cpp \
rasterizer/codegen/templates/gen_llvm.hpp
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index a1172b72cad..d9894c26015 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -73,7 +73,11 @@ CORE_CXX_SOURCES := \
rasterizer/core/api.h \
rasterizer/core/arena.h \
rasterizer/core/backend.cpp \
+ rasterizer/core/backend_clear.cpp \
+ rasterizer/core/backend_sample.cpp \
+ rasterizer/core/backend_singlesample.cpp \
rasterizer/core/backend.h \
+ rasterizer/core/backend_impl.h \
rasterizer/core/binner.cpp \
rasterizer/core/binner.h \
rasterizer/core/blend.h \
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index cdb85e2cad4..0f3cd6c8aa3 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -140,12 +140,22 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
# 2 centroid
# 2 forcedSampleCount
# 2 canEarlyZ
+backendPixelRateFileCount = 4
+backendPixelRateFilePat = "rasterizer/core/backends/gen_BackendPixelRate%s.cpp"
+backendPixelRateFiles = map(lambda x: backendPixelRateFilePat % x,
+ range(0, backendPixelRateFileCount))
env.CodeGenerate(
- target = 'rasterizer/core/gen_BackendPixelRate0.cpp',
+ target = 'rasterizer/core/backends/gen_BackendPixelRate.hpp',
script = swrroot + 'rasterizer/codegen/gen_backends.py',
source = '',
- command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
-)
+ command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --dim 5 2 3 2 2 2 --numfiles ' + str(backendPixelRateFileCount) + ' --cpp --hpp'
+ )
+Depends(backendPixelRateFiles,
+ ['rasterizer/core/backends/gen_BackendPixelRate.hpp',
+ 'rasterizer/archrast/gen_ar_event.hpp',
+ 'rasterizer/codegen/gen_knobs.h']
+ )
+
Depends('rasterizer/jitter/gen_state_llvm.h',
swrroot + 'rasterizer/codegen/templates/gen_backend.cpp')
@@ -153,9 +163,10 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
built_sources = [
'rasterizer/codegen/gen_knobs.cpp',
'rasterizer/archrast/gen_ar_event.cpp',
- 'rasterizer/core/gen_BackendPixelRate0.cpp',
]
+built_sources += backendPixelRateFiles
+
source = built_sources
source += env.ParseSourceList(swrroot + 'Makefile.sources', [
'CXX_SOURCES',
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
index f65f7648c41..3f0790c8dae 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
@@ -35,7 +35,9 @@ def main(args=sys.argv[1:]):
parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
+ parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
+ parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
args = parser.parse_args(args);
@@ -43,11 +45,14 @@ def main(args=sys.argv[1:]):
class backendStrs :
def __init__(self) :
self.outFileName = 'gen_BackendPixelRate%s.cpp'
+ self.outHeaderName = 'gen_BackendPixelRate.hpp'
self.functionTableName = 'gBackendPixelRateTable'
self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
self.template = 'gen_backend.cpp'
+ self.hpp_template = 'gen_header_init.hpp'
self.cmakeFileName = 'gen_backends.cmake'
self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
+ self.tableName = 'BackendPixelRate'
backend = backendStrs()
@@ -77,6 +82,8 @@ def main(args=sys.argv[1:]):
numFiles = 1
else:
numFiles = (len(output_list) + args.split - 1) // args.split
+ if (args.numfiles != 0):
+ numFiles = args.numfiles
linesPerFile = (len(output_list) + numFiles - 1) // numFiles
chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
@@ -94,6 +101,18 @@ def main(args=sys.argv[1:]):
fileNum=fileNum,
funcList=chunkedList[fileNum])
+ if args.hpp:
+ baseHppName = os.path.join(args.outdir, backend.outHeaderName)
+ templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
+
+ MakoTemplateWriter.to_file(
+ templateHpp,
+ baseHppName,
+ cmdline=sys.argv,
+ numFiles=numFiles,
+ filename=backend.outHeaderName,
+ tableName=backend.tableName)
+
# generate gen_backend.cmake file
if args.cmake:
templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
index 4eb4ad4f2b3..088b1cd79d5 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
@@ -32,6 +32,7 @@
//============================================================================
#include "core/backend.h"
+#include "core/backend_impl.h"
void InitBackendPixelRate${fileNum}()
{
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
new file mode 100644
index 00000000000..5625ef8a0de
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
@@ -0,0 +1,43 @@
+//============================================================================
+// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+//
+// @file ${filename}
+//
+// @brief auto-generated file
+//
+// DO NOT EDIT
+//
+// Generation Command Line:
+// ${'\n// '.join(cmdline)}
+//
+//============================================================================
+
+%for num in range(numFiles):
+void Init${tableName}${num}();
+%endfor
+
+static INLINE void Init${tableName}()
+{
+ %for num in range(numFiles):
+ Init${tableName}${num}();
+ %endfor
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index ae9ced26f58..cf895fb0d26 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -784,9 +784,6 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
// templated backend function tables
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
@@ -838,7 +835,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
break;
}
}
-
+
+ SWR_ASSERT(backendFuncs.pfnBackend);
+
PFN_PROCESS_PRIMS pfnBinner;
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 8c9449baa00..fe11cdfd2f9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -30,15 +30,14 @@
#include <smmintrin.h>
#include "backend.h"
+#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
+#include "backends/gen_BackendPixelRate.hpp"
#include <algorithm>
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
-static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
-
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
@@ -103,238 +102,6 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
SWR_ASSERT(x == 0 && y == 0);
}
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
-{
- auto lambda = [&](int32_t comp)
- {
- FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
- pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
- };
-
- const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
-
- for (uint32_t i = 0; i < numIter; ++i)
- {
- UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
- }
-}
-
-#if USE_8x2_TILE_BACKEND
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
-{
- auto lambda = [&](int32_t comp)
- {
- FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
- pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
- };
-
- const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
-
- for (uint32_t i = 0; i < numIter; ++i)
- {
- UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
- }
-}
-
-#endif
-template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
-{
- // convert clear color to hottile format
- // clear color is in RGBA float/uint32
-#if USE_8x2_TILE_BACKEND
- simd16vector vClear;
- for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
- {
- simd16scalar vComp;
- vComp = _simd16_load1_ps((const float*)&clear[comp]);
- if (FormatTraits<format>::isNormalized(comp))
- {
- vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
- vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
- }
- vComp = FormatTraits<format>::pack(comp, vComp);
- vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
- }
-
-#else
- simdvector vClear;
- for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
- {
- simdscalar vComp;
- vComp = _simd_load1_ps((const float*)&clear[comp]);
- if (FormatTraits<format>::isNormalized(comp))
- {
- vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
- vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
- }
- vComp = FormatTraits<format>::pack(comp, vComp);
- vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
- }
-
-#endif
- uint32_t tileX, tileY;
- MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
-
- // Init to full macrotile
- SWR_RECT clearTile =
- {
- KNOB_MACROTILE_X_DIM * int32_t(tileX),
- KNOB_MACROTILE_Y_DIM * int32_t(tileY),
- KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
- KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
- };
-
- // intersect with clear rect
- clearTile &= rect;
-
- // translate to local hottile origin
- clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
-
- // Make maximums inclusive (needed for convert to raster tiles)
- clearTile.xmax -= 1;
- clearTile.ymax -= 1;
-
- // convert to raster tiles
- clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
- clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
- clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
- clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
-
- const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
- // compute steps between raster tile samples / raster tiles / macro tile rows
- const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
- const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
- const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
- const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
-
- HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
- uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
- uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
-
- // loop over all raster tiles in the current hot tile
- for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
- {
- uint8_t* pRasterTile = pRasterTileRow;
- for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
- {
- for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
- {
- ClearRasterTile<format>(pRasterTile, vClear);
- pRasterTile += rasterTileSampleStep;
- }
- }
- pRasterTileRow += macroTileRowStep;
- }
-
- pHotTile->state = HOTTILE_DIRTY;
-}
-
-
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- if (KNOB_FAST_CLEAR)
- {
- CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
- SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
- uint32_t numSamples = GetNumSamples(sampleCount);
-
- SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
-
- AR_BEGIN(BEClear, pDC->drawId);
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
- {
- unsigned long rt = 0;
- uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
- while (_BitScanForward(&rt, mask))
- {
- mask &= ~(1 << rt);
-
- HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
-
- // All we want to do here is to mark the hot tile as being in a "needs clear" state.
- pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
- pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
- pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
- pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
- pHotTile->state = HOTTILE_CLEAR;
- }
- }
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
- {
- HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
- pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
- pHotTile->state = HOTTILE_CLEAR;
- }
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
- {
- HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
-
- pHotTile->clearData[0] = pClear->clearStencil;
- pHotTile->state = HOTTILE_CLEAR;
- }
-
- AR_END(BEClear, 1);
- }
- else
- {
- // Legacy clear
- CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
- AR_BEGIN(BEClear, pDC->drawId);
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
- {
- DWORD clearData[4];
- clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
- clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
- clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
- clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-
- PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
- SWR_ASSERT(pfnClearTiles != nullptr);
-
- unsigned long rt = 0;
- uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
- while (_BitScanForward(&rt, mask))
- {
- mask &= ~(1 << rt);
-
- pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
- }
- }
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
- {
- DWORD clearData[4];
- clearData[0] = *(DWORD*)&pClear->clearDepth;
- PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
- SWR_ASSERT(pfnClearTiles != nullptr);
-
- pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
- }
-
- if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
- {
- DWORD clearData[4];
- clearData[0] = pClear->clearStencil;
- PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
-
- pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
- }
-
- AR_END(BEClear, 1);
- }
-}
-
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
SWR_RENDERTARGET_ATTACHMENT attachment)
{
@@ -368,7 +135,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
// clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
if (pHotTile->state == HOTTILE_CLEAR)
{
- PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
+ PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
@@ -429,457 +196,6 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3
}
}
-#if KNOB_SIMD_WIDTH == 8
-const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#else
-#error Unsupported vector width
-#endif
-
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
-{
- simdscalar vClipMask = _simd_setzero_ps();
- uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
-
- for (uint32_t i = 0; i < numClipDistance; ++i)
- {
- // pull triangle clip distance values from clip buffer
- simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
- simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
- simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
-
- // interpolate
- simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
-
- // clip if interpolated clip distance is < 0 || NAN
- simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
-
- vClipMask = _simd_or_ps(vClipMask, vCull);
- }
-
- return _simd_movemask_ps(vClipMask);
-}
-
-template<typename T>
-void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(BESingleSampleBackend, pDC->drawId);
- AR_BEGIN(BESetup, pDC->drawId);
-
- const API_STATE &state = GetApiState(pDC);
-
- BarycentricCoeffs coeffs;
- SetupBarycentricCoeffs(&coeffs, work);
-
- uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
- SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
- SWR_PS_CONTEXT psContext;
- const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
- SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
- AR_END(BESetup, 1);
-
- psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
- psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
- const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
- for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
- {
- psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
- psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
- const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
- for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
- {
-#if USE_8x2_TILE_BACKEND
- const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
- simdmask coverageMask = work.coverageMask[0] & MASK;
-
- if (coverageMask)
- {
- if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
- {
- static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
- const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
-
- const float minz = state.depthBoundsState.depthBoundsTestMinValue;
- const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
- coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
- }
-
- if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
- {
- const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
- generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- CalcPixelBarycentrics(coeffs, psContext);
-
- CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
- // interpolate and quantize z
- psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
- psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
- AR_END(BEBarycentric, 1);
-
- // interpolate user clip distance if available
- if (state.rastState.clipDistanceMask)
- {
- coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
- }
-
- simdscalar vCoverageMask = vMask(coverageMask);
- simdscalar depthPassMask = vCoverageMask;
- simdscalar stencilPassMask = vCoverageMask;
-
- // Early-Z?
- if (T::bCanEarlyZ)
- {
- AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
- psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
- AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
- AR_END(BEEarlyDepthTest, 0);
-
- // early-exit if no pixels passed depth or earlyZ is forced on
- if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
- {
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-
- if (!_simd_movemask_ps(depthPassMask))
- {
- goto Endtile;
- }
- }
- }
-
- psContext.sampleIndex = 0;
- psContext.activeMask = _simd_castps_si(vCoverageMask);
-
- // execute pixel shader
- AR_BEGIN(BEPixelShader, pDC->drawId);
- UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- AR_END(BEPixelShader, 0);
-
- vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
- // late-Z
- if (!T::bCanEarlyZ)
- {
- AR_BEGIN(BELateDepthTest, pDC->drawId);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
- psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
- AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
- AR_END(BELateDepthTest, 0);
-
- if (!_simd_movemask_ps(depthPassMask))
- {
- // need to call depth/stencil write for stencil write
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
- goto Endtile;
- }
- } else {
- // for early z, consolidate discards from shader
- // into depthPassMask
- depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
- }
-
- uint32_t statMask = _simd_movemask_ps(depthPassMask);
- uint32_t statCount = _mm_popcnt_u32(statMask);
- UPDATE_STAT_BE(DepthPassCount, statCount);
-
- // output merger
- AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
- OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
- OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
- // do final depth write after all pixel kills
- if (!state.psState.forceEarlyZ)
- {
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
- }
- AR_END(BEOutputMerger, 0);
- }
-
-Endtile:
- AR_BEGIN(BEEndTile, pDC->drawId);
-
- work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
- {
- work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
-
-#if USE_8x2_TILE_BACKEND
- if (useAlternateOffset)
- {
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
- }
-#else
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
-#endif
- pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
- pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
- AR_END(BEEndTile, 0);
-
- psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
- psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
- }
-
- psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
- psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
- }
-
- AR_END(BESingleSampleBackend, 0);
-}
-
-template<typename T>
-void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(BESampleRateBackend, pDC->drawId);
- AR_BEGIN(BESetup, pDC->drawId);
-
- const API_STATE &state = GetApiState(pDC);
-
- BarycentricCoeffs coeffs;
- SetupBarycentricCoeffs(&coeffs, work);
-
- uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
- SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
- SWR_PS_CONTEXT psContext;
- const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
- SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
- AR_END(BESetup, 0);
-
- psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
- psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
- const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
- for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
- {
- psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
- psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
- const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
- for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
- {
-#if USE_8x2_TILE_BACKEND
- const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
- if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
- {
- const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
- generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- CalcPixelBarycentrics(coeffs, psContext);
-
- CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
- AR_END(BEBarycentric, 0);
-
- for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
- {
- simdmask coverageMask = work.coverageMask[sample] & MASK;
-
- if (coverageMask)
- {
- // offset depth/stencil buffers current sample
- uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
- uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
- if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
- {
- static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
- const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
- const float minz = state.depthBoundsState.depthBoundsTestMinValue;
- const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
- coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- // calculate per sample positions
- psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
- psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
- CalcSampleBarycentrics(coeffs, psContext);
-
- // interpolate and quantize z
- psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
- psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
- AR_END(BEBarycentric, 0);
-
- // interpolate user clip distance if available
- if (state.rastState.clipDistanceMask)
- {
- coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
- }
-
- simdscalar vCoverageMask = vMask(coverageMask);
- simdscalar depthPassMask = vCoverageMask;
- simdscalar stencilPassMask = vCoverageMask;
-
- // Early-Z?
- if (T::bCanEarlyZ)
- {
- AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
- psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
- AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
- AR_END(BEEarlyDepthTest, 0);
-
- // early-exit if no samples passed depth or earlyZ is forced on.
- if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
- {
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
- if (!_simd_movemask_ps(depthPassMask))
- {
- work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- continue;
- }
- }
- }
-
- psContext.sampleIndex = sample;
- psContext.activeMask = _simd_castps_si(vCoverageMask);
-
- // execute pixel shader
- AR_BEGIN(BEPixelShader, pDC->drawId);
- UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- AR_END(BEPixelShader, 0);
-
- vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
- // late-Z
- if (!T::bCanEarlyZ)
- {
- AR_BEGIN(BELateDepthTest, pDC->drawId);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
- psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
- AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
- AR_END(BELateDepthTest, 0);
-
- if (!_simd_movemask_ps(depthPassMask))
- {
- // need to call depth/stencil write for stencil write
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
- work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- continue;
- }
- }
-
- uint32_t statMask = _simd_movemask_ps(depthPassMask);
- uint32_t statCount = _mm_popcnt_u32(statMask);
- UPDATE_STAT_BE(DepthPassCount, statCount);
-
- // output merger
- AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
- OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
- OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
- // do final depth write after all pixel kills
- if (!state.psState.forceEarlyZ)
- {
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
- pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
- }
- AR_END(BEOutputMerger, 0);
- }
- work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
-
-Endtile:
- ATTR_UNUSED;
-
- AR_BEGIN(BEEndTile, pDC->drawId);
-
- if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
- {
- work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
-
-#if USE_8x2_TILE_BACKEND
- if (useAlternateOffset)
- {
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
- }
-#else
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
-#endif
- pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
- pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
- AR_END(BEEndTile, 0);
-
- psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
- psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
- }
-
- psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
- psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
- }
-
- AR_END(BESampleRateBackend, 0);
-}
-// optimized backend flow with NULL PS
template<uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
@@ -977,7 +293,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
UPDATE_STAT_BE(DepthPassCount, statCount);
}
-Endtile:
+ Endtile:
ATTR_UNUSED;
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
@@ -994,17 +310,7 @@ Endtile:
AR_END(BENullBackend, 0);
}
-void InitClearTilesTable()
-{
- memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
-
- sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
- sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
- sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
- sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
- sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
-}
-
+PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
[2] // centroid
@@ -1023,113 +329,10 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
[2] // canEarlyZ
= {};
-// Recursive template used to auto-nest conditionals. Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
- // Last Arg Terminator
- static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
- {
- switch(tArg)
- {
- case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
- case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
- case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
- default:
- SWR_ASSERT(0 && "Invalid backend func\n");
- return nullptr;
- break;
- }
- }
-
- // Recursively parse args
- template <typename... TArgsT>
- static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
- {
- switch(tArg)
- {
- case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
- case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
- case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
- default:
- SWR_ASSERT(0 && "Invalid sample pattern\n");
- return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
- break;
- }
- }
-
- // Recursively parse args
- template <typename... TArgsT>
- static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
- {
- switch(tArg)
- {
- case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
- case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
- case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
- case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
- case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
- default:
- SWR_ASSERT(0 && "Invalid sample count\n");
- return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
- break;
- }
- }
-
- // Recursively parse args
- template <typename... TArgsT>
- static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
- {
- if(tArg == true)
- {
- return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
- }
-
- return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
- }
-};
-
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
-{
- for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
- {
- for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
- {
- for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
- {
- table[inputCoverage][isCentroid][canEarlyZ] =
- BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
- (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
- }
- }
- }
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
-{
- for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
- {
- for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
- {
- for(uint32_t centroid = 0; centroid < 2; centroid++)
- {
- for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
- {
- table[sampleCount][inputCoverage][centroid][canEarlyZ] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
- (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
- }
- }
- }
- }
-}
-
-void InitBackendPixelRate0();
void InitBackendFuncTables()
{
+ InitBackendPixelRate();
InitBackendSingleFuncTable(gBackendSingleSample);
- InitBackendPixelRate0();
InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 035948652bc..c8c37e65257 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -40,1022 +40,23 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
-void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
-void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
- [2] // isCenterPattern
- [SWR_INPUT_COVERAGE_COUNT]
- [2] // centroid
- [2] // forcedSampleCount
- [2] // canEarlyZ
- ;
-
-enum SWR_BACKEND_FUNCS
-{
- SWR_BACKEND_SINGLE_SAMPLE,
- SWR_BACKEND_MSAA_PIXEL_RATE,
- SWR_BACKEND_MSAA_SAMPLE_RATE,
- SWR_BACKEND_FUNCS_MAX,
-};
-
-#if KNOB_SIMD_WIDTH == 8
-extern const simdscalar vCenterOffsetsX;
-extern const simdscalar vCenterOffsetsY;
-extern const simdscalar vULOffsetsX;
-extern const simdscalar vULOffsetsY;
-#define MASK 0xff
-#endif
-
-INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-{
- static const uint32_t RasterTileColorOffsets[16]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < 16);
- return RasterTileColorOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-{
- static const uint32_t RasterTileDepthOffsets[16]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < 16);
- return RasterTileDepthOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-{
- static const uint32_t RasterTileStencilOffsets[16]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < 16);
- return RasterTileStencilOffsets[sampleNum];
-}
-
-template<typename T, uint32_t InputCoverage>
-struct generateInputCoverage
-{
- INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
- {
- // will need to update for avx512
- assert(KNOB_SIMD_WIDTH == 8);
-
- simdscalari mask[2];
- simdscalari sampleCoverage[2];
-
- if(T::bIsCenterPattern)
- {
- // center coverage is the same for all samples; just broadcast to the sample slots
- uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
- if(T::MultisampleT::numSamples == 1)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
- }
- else if(T::MultisampleT::numSamples == 2)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
- }
- else if(T::MultisampleT::numSamples == 4)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
- }
- else if(T::MultisampleT::numSamples == 8)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- }
- else if(T::MultisampleT::numSamples == 16)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
- }
- }
- else
- {
- __m256i src = _mm256_set1_epi32(0);
- __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
- if(T::MultisampleT::numSamples == 1)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
- }
- else if(T::MultisampleT::numSamples == 2)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- }
- else if(T::MultisampleT::numSamples == 4)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
- }
- else if(T::MultisampleT::numSamples == 8)
- {
- mask[0] = _mm256_set1_epi32(-1);
- }
- else if(T::MultisampleT::numSamples == 16)
- {
- mask[0] = _mm256_set1_epi32(-1);
- mask[1] = _mm256_set1_epi32(-1);
- index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
- }
-
- // gather coverage for samples 0-7
- sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
- if(T::MultisampleT::numSamples > 8)
- {
- // gather coverage for samples 8-15
- sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
- }
- }
-
- mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
- // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
- simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
- simdscalari packedCoverage1;
- if(T::MultisampleT::numSamples > 8)
- {
- // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
- packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
- }
-
- #if (KNOB_ARCH == KNOB_ARCH_AVX)
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
- simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
- simdscalari packedSampleCoverage;
- if(T::MultisampleT::numSamples > 8)
- {
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
- shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
- packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
- packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
- #else
- simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
- simdscalari packedSampleCoverage;
- if(T::MultisampleT::numSamples > 8)
- {
- permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
- // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
- packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
- #endif
-
- for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
- {
- // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
- inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
- if(!T::bForcedSampleCount)
- {
- // input coverage has to be anded with sample mask if MSAA isn't forced on
- inputMask[i] &= sampleMask;
- }
-
- // shift to the next pixel in the 4x2
- packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
- }
- }
-
- INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
- {
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
- inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
- }
-
-};
-
-template<typename T>
-struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
-{
- INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
- {
- // will need to update for avx512
- assert(KNOB_SIMD_WIDTH == 8);
- simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
- const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
- vec = _simd_and_si(vec, bit);
- vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
- vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
- inputCoverage = _simd_castsi_ps(vec);
- }
-
- INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
- {
- uint32_t simdCoverage = (coverageMask[0] & MASK);
- static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
- for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
- {
- // set all samples to covered if conservative coverage mask is set for that pixel
- inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
- }
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
-// have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
-// coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
-// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
-// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
- const uint64_t *const coverageMask, const uint32_t sampleMask,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-
- // Case (2) - partially covered pixel
-
- // scan for first covered sample per pixel in the 4x2 span
- unsigned long sampleNum[KNOB_SIMD_WIDTH];
- (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
- (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
- (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
- (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
- (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
- (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
- (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
- (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
- // look up and set the sample offsets from UL pixel corner for first covered sample
- __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
- samplePos.X(sampleNum[6]),
- samplePos.X(sampleNum[5]),
- samplePos.X(sampleNum[4]),
- samplePos.X(sampleNum[3]),
- samplePos.X(sampleNum[2]),
- samplePos.X(sampleNum[1]),
- samplePos.X(sampleNum[0]));
-
- __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
- samplePos.Y(sampleNum[6]),
- samplePos.Y(sampleNum[5]),
- samplePos.Y(sampleNum[4]),
- samplePos.Y(sampleNum[3]),
- samplePos.Y(sampleNum[2]),
- samplePos.Y(sampleNum[1]),
- samplePos.Y(sampleNum[0]));
- // add sample offset to UL pixel corner
- vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
- vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
- // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
- static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
- simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
- simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
- static const simdscalari vZero = _simd_setzero_si();
- const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
- simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
- simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
- simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
- simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
- // set the centroid position based on results from above
- psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
- psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
- // Case (3a) No samples covered and partial sample mask
- simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
- // sample mask should never be all 0's for this case, but handle it anyways
- unsigned long firstCoveredSampleMaskSample = 0;
- (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
- simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
- vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
- vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
-
- // blend in case 3a pixel locations
- psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
- psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
- // evaluate I,J
- psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
- psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
- psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
- psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-}
-
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
-{
- const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
- const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
-
- return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
-}
-
-template<typename T>
-INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
-{
- // RT has to be single sample if we're in forcedMSAA mode
- if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
- {
- return 1;
- }
- // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
- else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
- {
- return GetNumSamples(blendSampleCount);
- }
- // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
- else
- {
- return T::MultisampleT::numSamples;
- }
-}
-
-inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
-{
- // broadcast scalars
-
- coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
- coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
- coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
-
- coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
- coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
- coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
-
- coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
- coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
- coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
-
- coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
-
- coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
- coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
- coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
-}
-
-inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
-{
- assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
-
- if (pColorBuffer)
- {
- for (uint32_t index = 0; index < colorBufferCount; index += 1)
- {
- pColorBuffer[index] = renderBuffers.pColor[index];
- }
- }
-
- if (pDepthBuffer)
- {
- *pDepthBuffer = renderBuffers.pDepth;
- }
-
- if (pStencilBuffer)
- {
- *pStencilBuffer = renderBuffers.pStencil;;
- }
-}
-
-template<typename T>
-void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
-{
- psContext->pAttribs = work.pAttribs;
- psContext->pPerspAttribs = work.pPerspAttribs;
- psContext->frontFace = work.triFlags.frontFacing;
- psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
-
- // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
- psContext->I = work.I;
- psContext->J = work.J;
-
- psContext->recipDet = work.recipDet;
- psContext->pRecipW = work.pRecipW;
- psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
- psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
- psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
- psContext->sampleIndex = 0;
-}
-
-template<typename T, bool IsSingleSample>
-void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
- const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
-{
- if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
- {
- // for 1x case, centroid is pixel center
- psContext->vX.centroid = psContext->vX.center;
- psContext->vY.centroid = psContext->vY.center;
- psContext->vI.centroid = psContext->vI.center;
- psContext->vJ.centroid = psContext->vJ.center;
- psContext->vOneOverW.centroid = psContext->vOneOverW.center;
- }
- else
- {
- if (T::bCentroidPos)
- {
- ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
- if (T::bIsCenterPattern)
- {
- psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
- psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
- }
- else
- {
- // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
- CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
- }
-
- CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
- }
- else
- {
- psContext->vX.centroid = psContext->vX.sample;
- psContext->vY.centroid = psContext->vY.sample;
- }
- }
-}
-
-template<typename T>
-struct PixelRateZTestLoop
-{
- PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
- uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
- pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
- samplePos(state.rastState.samplePositions),
- clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
-
- INLINE
- uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
- const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
- {
- SWR_CONTEXT *pContext = pDC->pContext;
-
- uint32_t statCount = 0;
- simdscalar anyDepthSamplePassed = _simd_setzero_ps();
- for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
- {
- const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
- vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
-
- if(!_simd_movemask_ps(vCoverageMask[sample]))
- {
- vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
- continue;
- }
-
- // offset depth/stencil buffers current sample
- uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
- uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
- if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
- {
- static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
- const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
- const float minz = state.depthBoundsState.depthBoundsTestMinValue;
- const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
- vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- // calculate per sample positions
- psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
- psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
- // calc I & J per sample
- CalcSampleBarycentrics(coeffs, psContext);
-
- if(psState.writesODepth)
- {
- {
- // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
- vZ[sample] = psContext.vZ;
- }
- }
- else
- {
- vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
- vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
- }
-
- AR_END(BEBarycentric, 0);
-
- ///@todo: perspective correct vs non-perspective correct clipping?
- // if clip distances are enabled, we need to interpolate for each sample
- if(clipDistanceMask)
- {
- uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-
- vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
- }
+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
- // ZTest for this sample
- ///@todo Need to uncomment out this bucket.
- //AR_BEGIN(BEDepthBucket, pDC->drawId);
- depthPassMask[sample] = vCoverageMask[sample];
- stencilPassMask[sample] = vCoverageMask[sample];
- depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
- vZ[sample], pDepthSample, vCoverageMask[sample],
- pStencilSample, &stencilPassMask[sample]);
- //AR_END(BEDepthBucket, 0);
-
- // early-exit if no pixels passed depth or earlyZ is forced on
- if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
- {
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
- pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
-
- if(!_simd_movemask_ps(depthPassMask[sample]))
- {
- continue;
- }
- }
- anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
- uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
- statCount += _mm_popcnt_u32(statMask);
- }
-
- activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
- // return number of samples that passed depth and coverage
- return statCount;
- }
-
- // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
- simdscalar vZ[T::MultisampleT::numCoverageSamples];
- simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
- simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
- simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
-
-private:
- // functor inputs
- DRAW_CONTEXT* pDC;
- uint32_t workerId;
-
- const SWR_TRIANGLE_DESC& work;
- const BarycentricCoeffs& coeffs;
- const API_STATE& state;
- const SWR_PS_STATE& psState;
- const SWR_MULTISAMPLE_POS& samplePos;
- const uint8_t clipDistanceMask;
- uint8_t*& pDepthBuffer;
- uint8_t*& pStencilBuffer;
-};
-
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
- // evaluate I,J
- psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
- psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
- psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
- psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
-}
-
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
- // evaluate I,J
- psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
- psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
- psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
- psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
-}
-
-// Merge Output to 4x2 SIMD Tile Format
-INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
- const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
-{
- // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
- const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
- simdvector blendOut;
-
- for(uint32_t rt = 0; rt < NumRT; ++rt)
- {
- uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
-
- const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
- {
- // pfnBlendFunc may not update all channels. Initialize with PS output.
- /// TODO: move this into the blend JIT.
- blendOut = psContext.shaded[rt];
-
- // Blend outputs and update coverage mask for alpha test
- if(pfnBlendFunc[rt] != nullptr)
- {
- pfnBlendFunc[rt](
- pBlendState,
- psContext.shaded[rt],
- psContext.shaded[1],
- psContext.shaded[0].w,
- sample,
- pColorSample,
- blendOut,
- &psContext.oMask,
- (simdscalari*)&coverageMask);
- }
- }
-
- // final write mask
- simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
- ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
- static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
- const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
- // store with color mask
- if(!pRTBlend->writeDisableRed)
- {
- _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
- }
- if(!pRTBlend->writeDisableGreen)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
- }
- if(!pRTBlend->writeDisableBlue)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
- }
- if(!pRTBlend->writeDisableAlpha)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
- }
- }
-}
-
-#if USE_8x2_TILE_BACKEND
-// Merge Output to 8x2 SIMD16 Tile Format
-INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
- const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
-{
- // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
- uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-
- if (useAlternateOffset)
- {
- rasterTileColorOffset += sizeof(simdscalar);
- }
-
- simdvector blendSrc;
- simdvector blendOut;
-
- uint32_t colorBufferBit = 1;
- for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
- {
- simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
-
- const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
- if (colorBufferBit & colorBufferEnableMask)
- {
- blendSrc[0] = pColorSample[0];
- blendSrc[1] = pColorSample[2];
- blendSrc[2] = pColorSample[4];
- blendSrc[3] = pColorSample[6];
- }
-
- {
- // pfnBlendFunc may not update all channels. Initialize with PS output.
- /// TODO: move this into the blend JIT.
- blendOut = psContext.shaded[rt];
-
- // Blend outputs and update coverage mask for alpha test
- if(pfnBlendFunc[rt] != nullptr)
- {
- pfnBlendFunc[rt](
- pBlendState,
- psContext.shaded[rt],
- psContext.shaded[1],
- psContext.shaded[0].w,
- sample,
- reinterpret_cast<uint8_t *>(&blendSrc),
- blendOut,
- &psContext.oMask,
- reinterpret_cast<simdscalari *>(&coverageMask));
- }
- }
-
- // final write mask
- simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
- ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
- static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
- // store with color mask
- if (!pRTBlend->writeDisableRed)
- {
- _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
- }
- if (!pRTBlend->writeDisableGreen)
- {
- _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
- }
- if (!pRTBlend->writeDisableBlue)
- {
- _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
- }
- if (!pRTBlend->writeDisableAlpha)
- {
- _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
- }
- }
-}
-
-#endif
-
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
- ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
-
-
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(BEPixelRateBackend, pDC->drawId);
- AR_BEGIN(BESetup, pDC->drawId);
-
- const API_STATE &state = GetApiState(pDC);
-
- BarycentricCoeffs coeffs;
- SetupBarycentricCoeffs(&coeffs, work);
-
- SWR_PS_CONTEXT psContext;
- const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
- SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
- uint8_t *pDepthBuffer, *pStencilBuffer;
- SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
- AR_END(BESetup, 0);
-
- PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
-
- psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
- psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
- const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
- for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
- {
- psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
- psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
- const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
- for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
- {
-#if USE_8x2_TILE_BACKEND
- const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-#endif
- simdscalar activeLanes;
- if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
- activeLanes = vMask(work.anyCoveredSamples & MASK);
-
- if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
- {
- const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
- generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- CalcPixelBarycentrics(coeffs, psContext);
-
- CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
- AR_END(BEBarycentric, 0);
-
- if(T::bForcedSampleCount)
- {
- // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
- const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
- activeLanes = _simd_and_ps(activeLanes, vSampleMask);
- }
-
- // Early-Z?
- if(T::bCanEarlyZ && !T::bForcedSampleCount)
- {
- uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
- UPDATE_STAT_BE(DepthPassCount, depthPassCount);
- AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
- }
-
- // if we have no covered samples that passed depth at this point, go to next tile
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- if(state.psState.usesSourceDepth)
- {
- AR_BEGIN(BEBarycentric, pDC->drawId);
- // interpolate and quantize z
- psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
- psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
- AR_END(BEBarycentric, 0);
- }
-
- // pixels that are currently active
- psContext.activeMask = _simd_castps_si(activeLanes);
- psContext.oMask = T::MultisampleT::FullSampleMask();
-
- // execute pixel shader
- AR_BEGIN(BEPixelShader, pDC->drawId);
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
- AR_END(BEPixelShader, 0);
-
- // update active lanes to remove any discarded or oMask'd pixels
- activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- // late-Z
- if(!T::bCanEarlyZ && !T::bForcedSampleCount)
- {
- uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
- UPDATE_STAT_BE(DepthPassCount, depthPassCount);
- AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
- }
-
- // if we have no covered samples that passed depth at this point, skip OM and go to next tile
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- // output merger
- // loop over all samples, broadcasting the results of the PS to all passing pixels
- for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
- {
- AR_BEGIN(BEOutputMerger, pDC->drawId);
- // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
- uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
- simdscalar coverageMask, depthMask;
- if(T::bForcedSampleCount)
- {
- coverageMask = depthMask = activeLanes;
- }
- else
- {
- coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
- depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
- if(!_simd_movemask_ps(depthMask))
- {
- // stencil should already have been written in early/lateZ tests
- AR_END(BEOutputMerger, 0);
- continue;
- }
- }
-
- // broadcast the results of the PS to all passing pixels
-#if USE_8x2_TILE_BACKEND
- OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else // USE_8x2_TILE_BACKEND
- OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif // USE_8x2_TILE_BACKEND
-
- if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
- {
- uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
- uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
- pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
- }
- AR_END(BEOutputMerger, 0);
- }
-Endtile:
- AR_BEGIN(BEEndTile, pDC->drawId);
-
- for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
- {
- work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
-
- if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
- {
- work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
- work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-#if USE_8x2_TILE_BACKEND
- if (useAlternateOffset)
- {
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
- }
-#else
- for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
-#endif
- pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
- pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
- AR_END(BEEndTile, 0);
-
- psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
- psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
- }
-
- psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
- psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
- }
-
- AR_END(BEPixelRateBackend, 0);
-}
+extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
+extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
+ [2] // centroid
+ [2]; // canEarlyZ
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
+ [2] // isCenterPattern
+ [SWR_INPUT_COVERAGE_COUNT]
+ [2] // centroid
+ [2] // forcedSampleCount
+ [2] // canEarlyZ
+ ;
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
+ [SWR_INPUT_COVERAGE_COUNT]
+ [2] // centroid
+ [2]; // canEarlyZ
-template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
- uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
- >
-struct SwrBackendTraits
-{
- static const bool bIsCenterPattern = (isCenter == 1);
- static const uint32_t InputCoverage = coverage;
- static const bool bCentroidPos = (centroid == 1);
- static const bool bForcedSampleCount = (forced == 1);
- static const bool bCanEarlyZ = (canEarlyZ == 1);
- typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
new file mode 100644
index 00000000000..0ef54e266d7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
@@ -0,0 +1,281 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
+{
+ auto lambda = [&](int32_t comp)
+ {
+ FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
+
+ pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
+ };
+
+ const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
+
+ for (uint32_t i = 0; i < numIter; ++i)
+ {
+ UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
+ }
+}
+
+#if USE_8x2_TILE_BACKEND
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
+{
+ auto lambda = [&](int32_t comp)
+ {
+ FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
+
+ pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
+ };
+
+ const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
+
+ for (uint32_t i = 0; i < numIter; ++i)
+ {
+ UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
+ }
+}
+
+#endif
+template<SWR_FORMAT format>
+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
+{
+ // convert clear color to hottile format
+ // clear color is in RGBA float/uint32
+#if USE_8x2_TILE_BACKEND
+ simd16vector vClear;
+ for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
+ {
+ simd16scalar vComp;
+ vComp = _simd16_load1_ps((const float*)&clear[comp]);
+ if (FormatTraits<format>::isNormalized(comp))
+ {
+ vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
+ vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
+ }
+ vComp = FormatTraits<format>::pack(comp, vComp);
+ vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
+ }
+
+#else
+ simdvector vClear;
+ for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
+ {
+ simdscalar vComp;
+ vComp = _simd_load1_ps((const float*)&clear[comp]);
+ if (FormatTraits<format>::isNormalized(comp))
+ {
+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
+ }
+ vComp = FormatTraits<format>::pack(comp, vComp);
+ vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
+ }
+
+#endif
+ uint32_t tileX, tileY;
+ MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
+
+ // Init to full macrotile
+ SWR_RECT clearTile =
+ {
+ KNOB_MACROTILE_X_DIM * int32_t(tileX),
+ KNOB_MACROTILE_Y_DIM * int32_t(tileY),
+ KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
+ KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
+ };
+
+ // intersect with clear rect
+ clearTile &= rect;
+
+ // translate to local hottile origin
+ clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
+
+ // Make maximums inclusive (needed for convert to raster tiles)
+ clearTile.xmax -= 1;
+ clearTile.ymax -= 1;
+
+ // convert to raster tiles
+ clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
+ clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
+ clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
+ clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
+
+ const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+ // compute steps between raster tile samples / raster tiles / macro tile rows
+ const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
+ const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
+ const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
+ const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
+
+ HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
+ uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
+ uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
+
+ // loop over all raster tiles in the current hot tile
+ for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
+ {
+ uint8_t* pRasterTile = pRasterTileRow;
+ for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
+ {
+ for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
+ {
+ ClearRasterTile<format>(pRasterTile, vClear);
+ pRasterTile += rasterTileSampleStep;
+ }
+ }
+ pRasterTileRow += macroTileRowStep;
+ }
+
+ pHotTile->state = HOTTILE_DIRTY;
+}
+
+
+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ if (KNOB_FAST_CLEAR)
+ {
+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+ SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
+ uint32_t numSamples = GetNumSamples(sampleCount);
+
+ SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
+
+ AR_BEGIN(BEClear, pDC->drawId);
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+ {
+ unsigned long rt = 0;
+ uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+ while (_BitScanForward(&rt, mask))
+ {
+ mask &= ~(1 << rt);
+
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
+
+ // All we want to do here is to mark the hot tile as being in a "needs clear" state.
+ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+ pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+ pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+ pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+ pHotTile->state = HOTTILE_CLEAR;
+ }
+ }
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+ {
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
+ pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
+ pHotTile->state = HOTTILE_CLEAR;
+ }
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+ {
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
+
+ pHotTile->clearData[0] = pClear->clearStencil;
+ pHotTile->state = HOTTILE_CLEAR;
+ }
+
+ AR_END(BEClear, 1);
+ }
+ else
+ {
+ // Legacy clear
+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+ AR_BEGIN(BEClear, pDC->drawId);
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+ {
+ DWORD clearData[4];
+ clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+ clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+ clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+ clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+
+ PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
+ SWR_ASSERT(pfnClearTiles != nullptr);
+
+ unsigned long rt = 0;
+ uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+ while (_BitScanForward(&rt, mask))
+ {
+ mask &= ~(1 << rt);
+
+ pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+ }
+ }
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+ {
+ DWORD clearData[4];
+ clearData[0] = *(DWORD*)&pClear->clearDepth;
+ PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
+ SWR_ASSERT(pfnClearTiles != nullptr);
+
+ pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+ }
+
+ if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+ {
+ DWORD clearData[4];
+ clearData[0] = pClear->clearStencil;
+ PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
+
+ pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+ }
+
+ AR_END(BEClear, 1);
+ }
+}
+
+void InitClearTilesTable()
+{
+ memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
+
+ gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
+ gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
+ gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
+ gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
+ gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
new file mode 100644
index 00000000000..e1518719840
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -0,0 +1,1067 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.h
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+#pragma once
+
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
+
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
+
+
+enum SWR_BACKEND_FUNCS
+{
+ SWR_BACKEND_SINGLE_SAMPLE,
+ SWR_BACKEND_MSAA_PIXEL_RATE,
+ SWR_BACKEND_MSAA_SAMPLE_RATE,
+ SWR_BACKEND_FUNCS_MAX,
+};
+
+#if KNOB_SIMD_WIDTH == 8
+static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+#define MASK 0xff
+#endif
+
+static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+{
+ simdscalar vClipMask = _simd_setzero_ps();
+ uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
+
+ for (uint32_t i = 0; i < numClipDistance; ++i)
+ {
+ // pull triangle clip distance values from clip buffer
+ simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
+ simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
+ simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
+
+ // interpolate
+ simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
+
+ // clip if interpolated clip distance is < 0 || NAN
+ simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
+
+ vClipMask = _simd_or_ps(vClipMask, vCull);
+ }
+
+ return _simd_movemask_ps(vClipMask);
+}
+
+INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileColorOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileColorOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileDepthOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileDepthOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileStencilOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileStencilOffsets[sampleNum];
+}
+
+template<typename T, uint32_t InputCoverage>
+struct generateInputCoverage
+{
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+ {
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+
+ simdscalari mask[2];
+ simdscalari sampleCoverage[2];
+
+ if(T::bIsCenterPattern)
+ {
+ // center coverage is the same for all samples; just broadcast to the sample slots
+ uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+ if(T::MultisampleT::numSamples == 1)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ }
+ else if(T::MultisampleT::numSamples == 2)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ }
+ else if(T::MultisampleT::numSamples == 4)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ }
+ else if(T::MultisampleT::numSamples == 8)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ }
+ else if(T::MultisampleT::numSamples == 16)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ }
+ }
+ else
+ {
+ __m256i src = _mm256_set1_epi32(0);
+ __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+ if(T::MultisampleT::numSamples == 1)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ }
+ else if(T::MultisampleT::numSamples == 2)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ }
+ else if(T::MultisampleT::numSamples == 4)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ }
+ else if(T::MultisampleT::numSamples == 8)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ }
+ else if(T::MultisampleT::numSamples == 16)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ mask[1] = _mm256_set1_epi32(-1);
+ index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ }
+
+ // gather coverage for samples 0-7
+ sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+ if(T::MultisampleT::numSamples > 8)
+ {
+ // gather coverage for samples 8-15
+ sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+ }
+ }
+
+ mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+ // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+ simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+ simdscalari packedCoverage1;
+ if(T::MultisampleT::numSamples > 8)
+ {
+ // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+ packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+ }
+
+ #if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+ simdscalari packedSampleCoverage;
+ if(T::MultisampleT::numSamples > 8)
+ {
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+ shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+ packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+ packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+ #else
+ simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+ simdscalari packedSampleCoverage;
+ if(T::MultisampleT::numSamples > 8)
+ {
+ permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+ // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+ packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+ #endif
+
+ for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+ {
+ // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+ inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+ if(!T::bForcedSampleCount)
+ {
+ // input coverage has to be anded with sample mask if MSAA isn't forced on
+ inputMask[i] &= sampleMask;
+ }
+
+ // shift to the next pixel in the 4x2
+ packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+ }
+ }
+
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+ {
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+ inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+ }
+
+};
+
+template<typename T>
+struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
+{
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+ {
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+ simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
+ const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ vec = _simd_and_si(vec, bit);
+ vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+ vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
+ inputCoverage = _simd_castsi_ps(vec);
+ }
+
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+ {
+ uint32_t simdCoverage = (coverageMask[0] & MASK);
+ static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
+ for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
+ {
+ // set all samples to covered if conservative coverage mask is set for that pixel
+ inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
+ }
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
+// have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
+// coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
+// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
+// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
+ const uint64_t *const coverageMask, const uint32_t sampleMask,
+ const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+
+ // Case (2) - partially covered pixel
+
+ // scan for first covered sample per pixel in the 4x2 span
+ unsigned long sampleNum[KNOB_SIMD_WIDTH];
+ (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+ (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+ (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+ (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+ (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+ (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+ (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+ (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+ // look up and set the sample offsets from UL pixel corner for first covered sample
+ __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
+ samplePos.X(sampleNum[6]),
+ samplePos.X(sampleNum[5]),
+ samplePos.X(sampleNum[4]),
+ samplePos.X(sampleNum[3]),
+ samplePos.X(sampleNum[2]),
+ samplePos.X(sampleNum[1]),
+ samplePos.X(sampleNum[0]));
+
+ __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
+ samplePos.Y(sampleNum[6]),
+ samplePos.Y(sampleNum[5]),
+ samplePos.Y(sampleNum[4]),
+ samplePos.Y(sampleNum[3]),
+ samplePos.Y(sampleNum[2]),
+ samplePos.Y(sampleNum[1]),
+ samplePos.Y(sampleNum[0]));
+ // add sample offset to UL pixel corner
+ vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+ vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+ // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+ static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+ simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+ simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+ static const simdscalari vZero = _simd_setzero_si();
+ const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+ simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+ simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+ simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+ simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+ // set the centroid position based on results from above
+ psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+ psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+ // Case (3a) No samples covered and partial sample mask
+ simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+ // sample mask should never be all 0's for this case, but handle it anyways
+ unsigned long firstCoveredSampleMaskSample = 0;
+ (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+
+ simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+
+ vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
+ vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
+
+ // blend in case 3a pixel locations
+ psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+ psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+ const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+ // evaluate I,J
+ psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+ psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+ psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
+ psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
+{
+ const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
+ const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
+
+ return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
+}
+
+template<typename T>
+INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
+{
+ // RT has to be single sample if we're in forcedMSAA mode
+ if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
+ {
+ return 1;
+ }
+ // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
+ else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
+ {
+ return GetNumSamples(blendSampleCount);
+ }
+ // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
+ else
+ {
+ return T::MultisampleT::numSamples;
+ }
+}
+
+inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
+{
+ // broadcast scalars
+
+ coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
+ coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
+ coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
+
+ coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
+ coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
+ coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
+
+ coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
+ coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
+ coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
+
+ coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+ coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+ coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+ coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+}
+
+inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
+{
+ assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
+
+ if (pColorBuffer)
+ {
+ for (uint32_t index = 0; index < colorBufferCount; index += 1)
+ {
+ pColorBuffer[index] = renderBuffers.pColor[index];
+ }
+ }
+
+ if (pDepthBuffer)
+ {
+ *pDepthBuffer = renderBuffers.pDepth;
+ }
+
+ if (pStencilBuffer)
+ {
+ *pStencilBuffer = renderBuffers.pStencil;;
+ }
+}
+
+template<typename T>
+void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
+{
+ psContext->pAttribs = work.pAttribs;
+ psContext->pPerspAttribs = work.pPerspAttribs;
+ psContext->frontFace = work.triFlags.frontFacing;
+ psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
+
+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+ psContext->I = work.I;
+ psContext->J = work.J;
+
+ psContext->recipDet = work.recipDet;
+ psContext->pRecipW = work.pRecipW;
+ psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
+ psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
+ psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
+ psContext->sampleIndex = 0;
+}
+
+template<typename T, bool IsSingleSample>
+void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
+ const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
+{
+ if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
+ {
+ // for 1x case, centroid is pixel center
+ psContext->vX.centroid = psContext->vX.center;
+ psContext->vY.centroid = psContext->vY.center;
+ psContext->vI.centroid = psContext->vI.center;
+ psContext->vJ.centroid = psContext->vJ.center;
+ psContext->vOneOverW.centroid = psContext->vOneOverW.center;
+ }
+ else
+ {
+ if (T::bCentroidPos)
+ {
+ ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
+ if (T::bIsCenterPattern)
+ {
+ psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
+ psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
+ }
+ else
+ {
+ // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
+ CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
+ }
+
+ CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
+ }
+ else
+ {
+ psContext->vX.centroid = psContext->vX.sample;
+ psContext->vY.centroid = psContext->vY.sample;
+ }
+ }
+}
+
+template<typename T>
+struct PixelRateZTestLoop
+{
+ PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
+ uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
+ pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+ samplePos(state.rastState.samplePositions),
+ clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
+
+ INLINE
+ uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
+ const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+ {
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ uint32_t statCount = 0;
+ simdscalar anyDepthSamplePassed = _simd_setzero_ps();
+ for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+ {
+ const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
+ vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
+
+ if(!_simd_movemask_ps(vCoverageMask[sample]))
+ {
+ vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+ continue;
+ }
+
+ // offset depth/stencil buffers current sample
+ uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+ if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+ {
+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+ const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+ const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+ const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+ vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ // calculate per sample positions
+ psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+ psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+ // calc I & J per sample
+ CalcSampleBarycentrics(coeffs, psContext);
+
+ if(psState.writesODepth)
+ {
+ {
+ // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
+ vZ[sample] = psContext.vZ;
+ }
+ }
+ else
+ {
+ vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
+ }
+
+ AR_END(BEBarycentric, 0);
+
+ ///@todo: perspective correct vs non-perspective correct clipping?
+ // if clip distances are enabled, we need to interpolate for each sample
+ if(clipDistanceMask)
+ {
+ uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+
+ vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+ }
+
+ // ZTest for this sample
+ ///@todo Need to uncomment out this bucket.
+ //AR_BEGIN(BEDepthBucket, pDC->drawId);
+ depthPassMask[sample] = vCoverageMask[sample];
+ stencilPassMask[sample] = vCoverageMask[sample];
+ depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ vZ[sample], pDepthSample, vCoverageMask[sample],
+ pStencilSample, &stencilPassMask[sample]);
+ //AR_END(BEDepthBucket, 0);
+
+ // early-exit if no pixels passed depth or earlyZ is forced on
+ if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+ {
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+ pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+
+ if(!_simd_movemask_ps(depthPassMask[sample]))
+ {
+ continue;
+ }
+ }
+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+ statCount += _mm_popcnt_u32(statMask);
+ }
+
+ activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
+ // return number of samples that passed depth and coverage
+ return statCount;
+ }
+
+ // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
+ simdscalar vZ[T::MultisampleT::numCoverageSamples];
+ simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
+ simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
+ simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
+
+private:
+ // functor inputs
+ DRAW_CONTEXT* pDC;
+ uint32_t workerId;
+
+ const SWR_TRIANGLE_DESC& work;
+ const BarycentricCoeffs& coeffs;
+ const API_STATE& state;
+ const SWR_PS_STATE& psState;
+ const SWR_MULTISAMPLE_POS& samplePos;
+ const uint8_t clipDistanceMask;
+ uint8_t*& pDepthBuffer;
+ uint8_t*& pStencilBuffer;
+};
+
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+ // evaluate I,J
+ psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
+ psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
+ psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
+ psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+ // evaluate I,J
+ psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+ psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+ psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
+ psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+// Merge Output to 4x2 SIMD Tile Format
+INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+ const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
+{
+ // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+ const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+ simdvector blendOut;
+
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+
+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+ {
+ // pfnBlendFunc may not update all channels. Initialize with PS output.
+ /// TODO: move this into the blend JIT.
+ blendOut = psContext.shaded[rt];
+
+ // Blend outputs and update coverage mask for alpha test
+ if(pfnBlendFunc[rt] != nullptr)
+ {
+ pfnBlendFunc[rt](
+ pBlendState,
+ psContext.shaded[rt],
+ psContext.shaded[1],
+ psContext.shaded[0].w,
+ sample,
+ pColorSample,
+ blendOut,
+ &psContext.oMask,
+ (simdscalari*)&coverageMask);
+ }
+ }
+
+ // final write mask
+ simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
+
+ // store with color mask
+ if(!pRTBlend->writeDisableRed)
+ {
+ _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
+ }
+ if(!pRTBlend->writeDisableGreen)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
+ }
+ if(!pRTBlend->writeDisableBlue)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
+ }
+ if(!pRTBlend->writeDisableAlpha)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
+ }
+ }
+}
+
+#if USE_8x2_TILE_BACKEND
+// Merge Output to 8x2 SIMD16 Tile Format
+INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+ const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
+{
+ // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+ uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+
+ if (useAlternateOffset)
+ {
+ rasterTileColorOffset += sizeof(simdscalar);
+ }
+
+ simdvector blendSrc;
+ simdvector blendOut;
+
+ uint32_t colorBufferBit = 1;
+ for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
+ {
+ simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
+
+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+ if (colorBufferBit & colorBufferEnableMask)
+ {
+ blendSrc[0] = pColorSample[0];
+ blendSrc[1] = pColorSample[2];
+ blendSrc[2] = pColorSample[4];
+ blendSrc[3] = pColorSample[6];
+ }
+
+ {
+ // pfnBlendFunc may not update all channels. Initialize with PS output.
+ /// TODO: move this into the blend JIT.
+ blendOut = psContext.shaded[rt];
+
+ // Blend outputs and update coverage mask for alpha test
+ if(pfnBlendFunc[rt] != nullptr)
+ {
+ pfnBlendFunc[rt](
+ pBlendState,
+ psContext.shaded[rt],
+ psContext.shaded[1],
+ psContext.shaded[0].w,
+ sample,
+ reinterpret_cast<uint8_t *>(&blendSrc),
+ blendOut,
+ &psContext.oMask,
+ reinterpret_cast<simdscalari *>(&coverageMask));
+ }
+ }
+
+ // final write mask
+ simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+ // store with color mask
+ if (!pRTBlend->writeDisableRed)
+ {
+ _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
+ }
+ if (!pRTBlend->writeDisableGreen)
+ {
+ _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
+ }
+ if (!pRTBlend->writeDisableBlue)
+ {
+ _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
+ }
+ if (!pRTBlend->writeDisableAlpha)
+ {
+ _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
+ }
+ }
+}
+
+#endif
+
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+ ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
+
+
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+ AR_BEGIN(BESetup, pDC->drawId);
+
+ const API_STATE &state = GetApiState(pDC);
+
+ BarycentricCoeffs coeffs;
+ SetupBarycentricCoeffs(&coeffs, work);
+
+ SWR_PS_CONTEXT psContext;
+ const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+ SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+ uint8_t *pDepthBuffer, *pStencilBuffer;
+ SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+ AR_END(BESetup, 0);
+
+ PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+ const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+ {
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+ const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+ {
+#if USE_8x2_TILE_BACKEND
+ const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+#endif
+ simdscalar activeLanes;
+ if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+ activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+ if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+ {
+ const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+ generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ CalcPixelBarycentrics(coeffs, psContext);
+
+ CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+ AR_END(BEBarycentric, 0);
+
+ if(T::bForcedSampleCount)
+ {
+ // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+ const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+ activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+ }
+
+ // Early-Z?
+ if(T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+ UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+ AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+ }
+
+ // if we have no covered samples that passed depth at this point, go to next tile
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ if(state.psState.usesSourceDepth)
+ {
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+ // interpolate and quantize z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+ AR_END(BEBarycentric, 0);
+ }
+
+ // pixels that are currently active
+ psContext.activeMask = _simd_castps_si(activeLanes);
+ psContext.oMask = T::MultisampleT::FullSampleMask();
+
+ // execute pixel shader
+ AR_BEGIN(BEPixelShader, pDC->drawId);
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+ AR_END(BEPixelShader, 0);
+
+ // update active lanes to remove any discarded or oMask'd pixels
+ activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ // late-Z
+ if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+ UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+ AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+ }
+
+ // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ // output merger
+ // loop over all samples, broadcasting the results of the PS to all passing pixels
+ for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+ {
+ AR_BEGIN(BEOutputMerger, pDC->drawId);
+ // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+ uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
+ simdscalar coverageMask, depthMask;
+ if(T::bForcedSampleCount)
+ {
+ coverageMask = depthMask = activeLanes;
+ }
+ else
+ {
+ coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+ depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+ if(!_simd_movemask_ps(depthMask))
+ {
+ // stencil should already have been written in early/lateZ tests
+ AR_END(BEOutputMerger, 0);
+ continue;
+ }
+ }
+
+ // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+ OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else // USE_8x2_TILE_BACKEND
+ OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif // USE_8x2_TILE_BACKEND
+
+ if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+ {
+ uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+ pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+ }
+ AR_END(BEOutputMerger, 0);
+ }
+Endtile:
+ AR_BEGIN(BEEndTile, pDC->drawId);
+
+ for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+ {
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+
+ if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+ {
+ work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+ work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+ if (useAlternateOffset)
+ {
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ }
+#else
+ for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+#endif
+ pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+ pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+ AR_END(BEEndTile, 0);
+
+ psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+ psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+ }
+
+ psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+ psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+ }
+
+ AR_END(BEPixelRateBackend, 0);
+}
+
+template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
+ uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
+ >
+struct SwrBackendTraits
+{
+ static const bool bIsCenterPattern = (isCenter == 1);
+ static const uint32_t InputCoverage = coverage;
+ static const bool bCentroidPos = (centroid == 1);
+ static const bool bForcedSampleCount = (forced == 1);
+ static const bool bCanEarlyZ = (canEarlyZ == 1);
+ typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
new file mode 100644
index 00000000000..0f75ec24fb0
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -0,0 +1,345 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>
+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ AR_BEGIN(BESampleRateBackend, pDC->drawId);
+ AR_BEGIN(BESetup, pDC->drawId);
+
+ const API_STATE &state = GetApiState(pDC);
+
+ BarycentricCoeffs coeffs;
+ SetupBarycentricCoeffs(&coeffs, work);
+
+ uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+ SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+ SWR_PS_CONTEXT psContext;
+ const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+ SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+ AR_END(BESetup, 0);
+
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+ const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+ {
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+ const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+ {
+#if USE_8x2_TILE_BACKEND
+ const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+ if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+ {
+ const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+ generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ CalcPixelBarycentrics(coeffs, psContext);
+
+ CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+ AR_END(BEBarycentric, 0);
+
+ for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
+ {
+ simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+ if (coverageMask)
+ {
+ // offset depth/stencil buffers current sample
+ uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+ uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+ if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+ {
+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+ const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+ const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+ const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+ coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ // calculate per sample positions
+ psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+ psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+ CalcSampleBarycentrics(coeffs, psContext);
+
+ // interpolate and quantize z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+ AR_END(BEBarycentric, 0);
+
+ // interpolate user clip distance if available
+ if (state.rastState.clipDistanceMask)
+ {
+ coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+ }
+
+ simdscalar vCoverageMask = vMask(coverageMask);
+ simdscalar depthPassMask = vCoverageMask;
+ simdscalar stencilPassMask = vCoverageMask;
+
+ // Early-Z?
+ if (T::bCanEarlyZ)
+ {
+ AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+ AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+ AR_END(BEEarlyDepthTest, 0);
+
+ // early-exit if no samples passed depth or earlyZ is forced on.
+ if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+ {
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+ if (!_simd_movemask_ps(depthPassMask))
+ {
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ continue;
+ }
+ }
+ }
+
+ psContext.sampleIndex = sample;
+ psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+ // execute pixel shader
+ AR_BEGIN(BEPixelShader, pDC->drawId);
+ UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ AR_END(BEPixelShader, 0);
+
+ vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+ // late-Z
+ if (!T::bCanEarlyZ)
+ {
+ AR_BEGIN(BELateDepthTest, pDC->drawId);
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+ AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+ AR_END(BELateDepthTest, 0);
+
+ if (!_simd_movemask_ps(depthPassMask))
+ {
+ // need to call depth/stencil write for stencil write
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ continue;
+ }
+ }
+
+ uint32_t statMask = _simd_movemask_ps(depthPassMask);
+ uint32_t statCount = _mm_popcnt_u32(statMask);
+ UPDATE_STAT_BE(DepthPassCount, statCount);
+
+ // output merger
+ AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+ OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+ OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+#endif
+
+ // do final depth write after all pixel kills
+ if (!state.psState.forceEarlyZ)
+ {
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+ }
+ AR_END(BEOutputMerger, 0);
+ }
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+
+ Endtile:
+ ATTR_UNUSED;
+
+ AR_BEGIN(BEEndTile, pDC->drawId);
+
+ if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+ {
+ work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+
+#if USE_8x2_TILE_BACKEND
+ if (useAlternateOffset)
+ {
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ }
+#else
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+#endif
+ pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+ pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+ AR_END(BEEndTile, 0);
+
+ psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+ psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+ }
+
+ psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+ psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+ }
+
+ AR_END(BESampleRateBackend, 0);
+}
+
+// Recursive template used to auto-nest conditionals. Converts dynamic enum function
+// arguments to static template arguments.
+template <uint32_t... ArgsT>
+struct BEChooserSampleRate
+{
+ // Last Arg Terminator
+ static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+ {
+ switch (tArg)
+ {
+ case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
+ case SWR_BACKEND_SINGLE_SAMPLE:
+ case SWR_BACKEND_MSAA_PIXEL_RATE:
+ SWR_ASSERT(0 && "Invalid backend func\n");
+ return nullptr;
+ break;
+ default:
+ SWR_ASSERT(0 && "Invalid backend func\n");
+ return nullptr;
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+ {
+ switch (tArg)
+ {
+ case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+ case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+ case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+ default:
+ SWR_ASSERT(0 && "Invalid sample pattern\n");
+ return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+ {
+ switch (tArg)
+ {
+ case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+ default:
+ SWR_ASSERT(0 && "Invalid sample count\n");
+ return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+ {
+ if (tArg == true)
+ {
+ return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
+ }
+
+ return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
+ }
+};
+
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
+{
+ for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
+ {
+ for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+ {
+ for (uint32_t centroid = 0; centroid < 2; centroid++)
+ {
+ for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+ {
+ table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+ BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
+ (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+ }
+ }
+ }
+ }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
new file mode 100644
index 00000000000..0eecc25882a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -0,0 +1,321 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+* operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>
+void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ AR_BEGIN(BESingleSampleBackend, pDC->drawId);
+ AR_BEGIN(BESetup, pDC->drawId);
+
+ const API_STATE &state = GetApiState(pDC);
+
+ BarycentricCoeffs coeffs;
+ SetupBarycentricCoeffs(&coeffs, work);
+
+ uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+ SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+ SWR_PS_CONTEXT psContext;
+ const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+ SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+ AR_END(BESetup, 1);
+
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+ const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+ {
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+ const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+ {
+#if USE_8x2_TILE_BACKEND
+ const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+ simdmask coverageMask = work.coverageMask[0] & MASK;
+
+ if (coverageMask)
+ {
+ if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+ {
+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+ const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
+
+ const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+ const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+ coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+ }
+
+ if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+ {
+ const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+ generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ CalcPixelBarycentrics(coeffs, psContext);
+
+ CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+ // interpolate and quantize z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+ AR_END(BEBarycentric, 1);
+
+ // interpolate user clip distance if available
+ if (state.rastState.clipDistanceMask)
+ {
+ coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
+ }
+
+ simdscalar vCoverageMask = vMask(coverageMask);
+ simdscalar depthPassMask = vCoverageMask;
+ simdscalar stencilPassMask = vCoverageMask;
+
+ // Early-Z?
+ if (T::bCanEarlyZ)
+ {
+ AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+ AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+ AR_END(BEEarlyDepthTest, 0);
+
+ // early-exit if no pixels passed depth or earlyZ is forced on
+ if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+ {
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+
+ if (!_simd_movemask_ps(depthPassMask))
+ {
+ goto Endtile;
+ }
+ }
+ }
+
+ psContext.sampleIndex = 0;
+ psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+ // execute pixel shader
+ AR_BEGIN(BEPixelShader, pDC->drawId);
+ UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ AR_END(BEPixelShader, 0);
+
+ vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+ // late-Z
+ if (!T::bCanEarlyZ)
+ {
+ AR_BEGIN(BELateDepthTest, pDC->drawId);
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+ AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+ AR_END(BELateDepthTest, 0);
+
+ if (!_simd_movemask_ps(depthPassMask))
+ {
+ // need to call depth/stencil write for stencil write
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+ goto Endtile;
+ }
+ } else {
+ // for early z, consolidate discards from shader
+ // into depthPassMask
+ depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
+ }
+
+ uint32_t statMask = _simd_movemask_ps(depthPassMask);
+ uint32_t statCount = _mm_popcnt_u32(statMask);
+ UPDATE_STAT_BE(DepthPassCount, statCount);
+
+ // output merger
+ AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+ OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+ OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+#endif
+
+ // do final depth write after all pixel kills
+ if (!state.psState.forceEarlyZ)
+ {
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+ }
+ AR_END(BEOutputMerger, 0);
+ }
+
+Endtile:
+ AR_BEGIN(BEEndTile, pDC->drawId);
+
+ work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+ {
+ work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+
+#if USE_8x2_TILE_BACKEND
+ if (useAlternateOffset)
+ {
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ }
+#else
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+#endif
+ pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+ pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+ AR_END(BEEndTile, 0);
+
+ psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+ psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+ }
+
+ psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+ psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+ }
+
+ AR_END(BESingleSampleBackend, 0);
+}
+
+// Recursive template used to auto-nest conditionals. Converts dynamic enum function
+// arguments to static template arguments.
+template <uint32_t... ArgsT>
+struct BEChooserSingleSample
+{
+ // Last Arg Terminator
+ static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+ {
+ switch(tArg)
+ {
+ case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
+ case SWR_BACKEND_MSAA_PIXEL_RATE:
+ case SWR_BACKEND_MSAA_SAMPLE_RATE:
+ default:
+ SWR_ASSERT(0 && "Invalid backend func\n");
+ return nullptr;
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+ {
+ switch(tArg)
+ {
+ case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+ case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+ case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+ default:
+ SWR_ASSERT(0 && "Invalid sample pattern\n");
+ return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+ {
+ switch(tArg)
+ {
+ case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
+ case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+ default:
+ SWR_ASSERT(0 && "Invalid sample count\n");
+ return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+ break;
+ }
+ }
+
+ // Recursively parse args
+ template <typename... TArgsT>
+ static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+ {
+ if(tArg == true)
+ {
+ return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
+ }
+
+ return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
+ }
+};
+
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
+{
+ for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+ {
+ for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
+ {
+ for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+ {
+ table[inputCoverage][isCentroid][canEarlyZ] =
+ BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
+ (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+ }
+ }
+ }
+}