summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeorge Kyriazis <[email protected]>2018-01-22 19:23:32 -0600
committerGeorge Kyriazis <[email protected]>2018-01-25 13:26:49 -0600
commit36dbbf11a0afda57940b7ca224f318dd3915d8c5 (patch)
tree87db5c7049b751da46c7261d8fd41208b31c492a
parent94922dbe4bc5bbe77124f435607641abe4e8c114 (diff)
swr/rast: Move memory-related JIT functions
Move them to their own file (builder_mem.{h|cpp}). Add builder_mem.cpp to the build system. Reviewed-by: Bruce Cherniak <[email protected]>
-rw-r--r--src/gallium/drivers/swr/Makefile.sources2
-rw-r--r--src/gallium/drivers/swr/meson.build2
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp816
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h73
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp777
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h41
7 files changed, 894 insertions, 818 deletions
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index cd2040e1371..cbf73953911 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -140,6 +140,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/builder.cpp \
rasterizer/jitter/builder.h \
rasterizer/jitter/builder_math.h \
+ rasterizer/jitter/builder_mem.cpp \
+ rasterizer/jitter/builder_mem.h \
rasterizer/jitter/builder_misc.cpp \
rasterizer/jitter/builder_misc.h \
rasterizer/jitter/fetch_jit.cpp \
diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build
index 8dffb4c000d..ae86c8ec380 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -68,6 +68,8 @@ files_swr_mesa = files(
'rasterizer/jitter/builder.cpp',
'rasterizer/jitter/builder.h',
'rasterizer/jitter/builder_math.h',
+ 'rasterizer/jitter/builder_mem.cpp',
+ 'rasterizer/jitter/builder_mem.h',
'rasterizer/jitter/builder_misc.cpp',
'rasterizer/jitter/builder_misc.h',
'rasterizer/jitter/fetch_jit.cpp',
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 2e714f97380..5d1a6b9273c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -94,5 +94,6 @@ namespace SwrJit
#include "gen_builder_x86.hpp"
#include "builder_misc.h"
#include "builder_math.h"
+#include "builder_mem.h"
};
}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
new file mode 100644
index 00000000000..c8ba5b465e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -0,0 +1,816 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.cpp
+*
+* @brief Implementation for miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_pch.hpp"
+#include "builder.h"
+#include "common/rdtsc_buckets.h"
+
+#include <cstdarg>
+
+
+namespace SwrJit
+{
+
+ Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(i);
+ return GEPA(ptr, indices);
+ }
+
+ Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(C(i));
+ return GEPA(ptr, indices);
+ }
+
+ Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(i);
+ return IN_BOUNDS_GEP(ptr, indices);
+ }
+
+ Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(C(i));
+ return IN_BOUNDS_GEP(ptr, indices);
+ }
+
+ LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+ {
+ std::vector<Value*> valIndices;
+ for (auto i : indices)
+ valIndices.push_back(C(i));
+ return LOAD(GEPA(basePtr, valIndices), name);
+ }
+
+ LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+ {
+ std::vector<Value*> valIndices;
+ for (auto i : indices)
+ valIndices.push_back(i);
+ return LOAD(GEPA(basePtr, valIndices), name);
+ }
+
+ StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+ {
+ std::vector<Value*> valIndices;
+ for (auto i : indices)
+ valIndices.push_back(C(i));
+ return STORE(val, GEPA(basePtr, valIndices));
+ }
+
+ StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+ {
+ std::vector<Value*> valIndices;
+ for (auto i : indices)
+ valIndices.push_back(i);
+ return STORE(val, GEPA(basePtr, valIndices));
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate an i32 masked load operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with float masked load
+ /// @param src - base address pointer for the load
+ /// @param vMask - SIMD wide mask that controls whether to access memory load 0
+ Value *Builder::MASKLOADD(Value* src, Value* mask)
+ {
+ Value* vResult;
+ // use avx2 gather instruction is available
+ if (JM()->mArch.AVX2())
+ {
+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
+ vResult = CALL(func, { src,mask });
+ }
+ else
+ {
+ // maskload intrinsic expects integer mask operand in llvm >= 3.8
+#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
+ mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
+#else
+ mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
+#endif
+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
+ vResult = BITCAST(CALL(func, { src,mask }), VectorType::get(mInt32Ty, mVWidth));
+ }
+ return vResult;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate a masked gather operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with loads
+ /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+ /// @param pBase - Int8* base VB address pointer value
+ /// @param vIndices - SIMD wide value of VB byte offsets
+ /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+ /// @param scale - value to scale indices by
+ Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, Value *pDrawContext)
+ {
+ Value *vGather;
+
+ // use avx2 gather instruction if available
+ if (JM()->mArch.AVX2())
+ {
+ // force mask to <N x float>, required by vgather
+ Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
+
+ vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
+ }
+ else
+ {
+ Value* pStack = STACKSAVE();
+
+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+ Value* vSrcPtr = ALLOCA(vSrc->getType());
+ STORE(vSrc, vSrcPtr);
+
+ vGather = VUNDEF_F();
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
+ Value *vOffsets = MUL(vIndices, vScaleVec);
+ for (uint32_t i = 0; i < mVWidth; ++i)
+ {
+ // single component byte index
+ Value *offset = VEXTRACT(vOffsets, C(i));
+ // byte pointer to component
+ Value *loadAddress = GEP(pBase, offset);
+ loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
+ // pointer to the value to load if we're masking off a component
+ Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
+ Value *selMask = VEXTRACT(vMask, C(i));
+ // switch in a safe address to load if we're trying to access a vertex
+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+ Value *val = LOAD(validAddress);
+ vGather = VINSERT(vGather, val, C(i));
+ }
+
+ STACKRESTORE(pStack);
+ }
+
+ return vGather;
+ }
+
+ Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
+ {
+ Value *vGather = VUNDEF_F_16();
+
+ // use AVX512F gather instruction if available
+ if (JM()->mArch.AVX512F())
+ {
+ // force mask to <N-bit Integer>, required by vgather2
+ Value *mask = BITCAST(vMask, mInt16Ty);
+
+ vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
+ }
+ else
+ {
+ Value *src0 = EXTRACT_16(vSrc, 0);
+ Value *src1 = EXTRACT_16(vSrc, 1);
+
+ Value *indices0 = EXTRACT_16(vIndices, 0);
+ Value *indices1 = EXTRACT_16(vIndices, 1);
+
+ Value *mask0 = EXTRACT_16(vMask, 0);
+ Value *mask1 = EXTRACT_16(vMask, 1);
+
+ Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
+ Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
+
+ vGather = JOIN_16(gather0, gather1);
+ }
+
+ return vGather;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate a masked gather operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with loads
+ /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+ /// @param pBase - Int8* base VB address pointer value
+ /// @param vIndices - SIMD wide value of VB byte offsets
+ /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+ /// @param scale - value to scale indices by
+ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
+ {
+ Value* vGather;
+
+ // use avx2 gather instruction if available
+ if (JM()->mArch.AVX2())
+ {
+ vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
+ }
+ else
+ {
+ Value* pStack = STACKSAVE();
+
+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+ Value* vSrcPtr = ALLOCA(vSrc->getType());
+ STORE(vSrc, vSrcPtr);
+
+ vGather = VUNDEF_I();
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
+ Value *vOffsets = MUL(vIndices, vScaleVec);
+ for (uint32_t i = 0; i < mVWidth; ++i)
+ {
+ // single component byte index
+ Value *offset = VEXTRACT(vOffsets, C(i));
+ // byte pointer to component
+ Value *loadAddress = GEP(pBase, offset);
+ loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
+ // pointer to the value to load if we're masking off a component
+ Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
+ Value *selMask = VEXTRACT(vMask, C(i));
+ // switch in a safe address to load if we're trying to access a vertex
+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+ Value *val = LOAD(validAddress, C(0));
+ vGather = VINSERT(vGather, val, C(i));
+ }
+
+ STACKRESTORE(pStack);
+ }
+
+ return vGather;
+ }
+
+ Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
+ {
+ Value *vGather = VUNDEF_I_16();
+
+ // use AVX512F gather instruction if available
+ if (JM()->mArch.AVX512F())
+ {
+ // force mask to <N-bit Integer>, required by vgather2
+ Value *mask = BITCAST(vMask, mInt16Ty);
+
+ vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
+ }
+ else
+ {
+ Value *src0 = EXTRACT_16(vSrc, 0);
+ Value *src1 = EXTRACT_16(vSrc, 1);
+
+ Value *indices0 = EXTRACT_16(vIndices, 0);
+ Value *indices1 = EXTRACT_16(vIndices, 1);
+
+ Value *mask0 = EXTRACT_16(vMask, 0);
+ Value *mask1 = EXTRACT_16(vMask, 1);
+
+ Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
+ Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
+
+ vGather = JOIN_16(gather0, gather1);
+ }
+
+ return vGather;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate a masked gather operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with loads
+ /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+ /// @param pBase - Int8* base VB address pointer value
+ /// @param vIndices - SIMD wide value of VB byte offsets
+ /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+ /// @param scale - value to scale indices by
+ Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
+ {
+ Value* vGather;
+
+ // use avx2 gather instruction if available
+ if (JM()->mArch.AVX2())
+ {
+ vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
+ vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
+ }
+ else
+ {
+ Value* pStack = STACKSAVE();
+
+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+ Value* vSrcPtr = ALLOCA(vSrc->getType());
+ STORE(vSrc, vSrcPtr);
+
+ vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
+ Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
+ Value *vOffsets = MUL(vIndices, vScaleVec);
+ for (uint32_t i = 0; i < mVWidth / 2; ++i)
+ {
+ // single component byte index
+ Value *offset = VEXTRACT(vOffsets, C(i));
+ // byte pointer to component
+ Value *loadAddress = GEP(pBase, offset);
+ loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
+ // pointer to the value to load if we're masking off a component
+ Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
+ Value *selMask = VEXTRACT(vMask, C(i));
+ // switch in a safe address to load if we're trying to access a vertex
+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+ Value *val = LOAD(validAddress);
+ vGather = VINSERT(vGather, val, C(i));
+ }
+ STACKRESTORE(pStack);
+ }
+ return vGather;
+ }
+
+ void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+ {
+ const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+ if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
+ {
+ GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+ }
+ else
+ {
+ GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+ }
+ }
+
+ void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
+ {
+ switch (info.bpp / info.numComps)
+ {
+ case 16:
+ {
+ Value* vGatherResult[2];
+
+ // TODO: vGatherMaskedVal
+ Value* vGatherMaskedVal = VIMMED1((float)0);
+
+ // always have at least one component out of x or y to fetch
+
+ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
+ // e.g. result of first 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+ //
+
+ // if we have at least one component out of x or y to fetch
+ if (info.numComps > 2)
+ {
+ // offset base to the next components(zw) in the vertex to gather
+ pSrcBase = GEP(pSrcBase, C((char)4));
+
+ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
+ // e.g. result of second 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+ //
+ }
+ else
+ {
+ vGatherResult[1] = vGatherMaskedVal;
+ }
+
+ // Shuffle gathered components into place, each row is a component
+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+ }
+ break;
+ case 32:
+ {
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
+ }
+
+ for (uint32_t i = 0; i < info.numComps; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // Gather a SIMD of components
+ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
+
+ // offset base to the next component to gather
+ pSrcBase = GEP(pSrcBase, C((char)4));
+ }
+ }
+ break;
+ default:
+ SWR_INVALID("Invalid float format");
+ break;
+ }
+ }
+
+ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
+ {
+ switch (info.bpp / info.numComps)
+ {
+ case 8:
+ {
+ Value* vGatherMaskedVal = VIMMED1((int32_t)0);
+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
+ // e.g. result of an 8x32bit integer gather for 8bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+
+ Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+ }
+ break;
+ case 16:
+ {
+ Value* vGatherResult[2];
+
+ // TODO: vGatherMaskedVal
+ Value* vGatherMaskedVal = VIMMED1((int32_t)0);
+
+ // always have at least one component out of x or y to fetch
+
+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
+ // e.g. result of first 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+ //
+
+ // if we have at least one component out of x or y to fetch
+ if (info.numComps > 2)
+ {
+ // offset base to the next components(zw) in the vertex to gather
+ pSrcBase = GEP(pSrcBase, C((char)4));
+
+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
+ // e.g. result of second 8x32bit integer gather for 16bit components
+ // 256i - 0 1 2 3 4 5 6 7
+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+ //
+ }
+ else
+ {
+ vGatherResult[1] = vGatherMaskedVal;
+ }
+
+ // Shuffle gathered components into place, each row is a component
+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+
+ }
+ break;
+ case 32:
+ {
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
+ }
+
+ for (uint32_t i = 0; i < info.numComps; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // Gather a SIMD of components
+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
+
+ // offset base to the next component to gather
+ pSrcBase = GEP(pSrcBase, C((char)4));
+ }
+ }
+ break;
+ default:
+ SWR_INVALID("unsupported format");
+ break;
+ }
+ }
+
+ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
+ {
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ // input could either be float or int vector; do shuffle work in int
+ vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
+ vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
+
+ if (bPackedOutput)
+ {
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask
+ Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+ Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ // after PERMD: move and pack xy components into each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if (info.numComps > 2)
+ {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ }
+
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+ // todo: fixed for packed
+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+ if (i >= info.numComps)
+ {
+ // set the default component val
+ vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+ continue;
+ }
+
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ // extract packed component 128 bit lanes
+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+ }
+
+ }
+ else
+ {
+ // pshufb masks for each component
+ Value* vConstMask[2];
+ // x/z shuffle mask
+ vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+
+ // y/w shuffle mask
+ vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+
+
+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
+ }
+
+ for (uint32_t i = 0; i < info.numComps; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // select correct constMask for x/z or y/w pshufb
+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ // after pshufb mask for x channel; z uses the same shuffle from the second gather
+ // 256i - 0 1 2 3 4 5 6 7
+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+ }
+ }
+ }
+
+ void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
+ {
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ if (bPackedOutput)
+ {
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+ // shuffle mask
+ Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+ Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if (info.numComps > 2)
+ {
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ }
+
+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ uint32_t swizzleIndex = info.swizzle[i];
+ // todo: fix for packed
+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+ if (i >= info.numComps)
+ {
+ // set the default component val
+ vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+ continue;
+ }
+
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ // sign extend
+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+ }
+ }
+ // else zero extend
+ else {
+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+ // apply defaults
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
+ }
+
+ for (uint32_t i = 0; i < info.numComps; i++) {
+ uint32_t swizzleIndex = info.swizzle[i];
+
+ // pshufb masks for each component
+ Value* vConstMask;
+ switch (i)
+ {
+ case 0:
+ // x shuffle mask
+ vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
+ break;
+ case 1:
+ // y shuffle mask
+ vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
+ break;
+ case 2:
+ // z shuffle mask
+ vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
+ break;
+ case 3:
+ // w shuffle mask
+ vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
+ break;
+ default:
+ vConstMask = nullptr;
+ break;
+ }
+
+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb for x channel
+ // 256i - 0 1 2 3 4 5 6 7
+ // x000 x000 x000 x000 x000 x000 x000 x000
+ }
+ }
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief emulates a scatter operation.
+ /// @param pDst - pointer to destination
+ /// @param vSrc - vector of src data to scatter
+ /// @param vOffsets - vector of byte offsets from pDst
+ /// @param vMask - mask of valid lanes
+ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+ {
+ /* Scatter algorithm
+
+ while(Index = BitScanForward(mask))
+ srcElem = srcVector[Index]
+ offsetElem = offsetVector[Index]
+ *(pDst + offsetElem) = srcElem
+ Update mask (&= ~(1<<Index)
+
+ */
+
+ BasicBlock* pCurBB = IRB()->GetInsertBlock();
+ Function* pFunc = pCurBB->getParent();
+ Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
+ // Store vectors on stack
+ if (pScatterStackSrc == nullptr)
+ {
+ // Save off stack allocations and reuse per scatter. Significantly reduces stack
+ // requirements for shaders with a lot of scatters.
+ pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
+ pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
+ }
+
+ Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
+ Value* pOffsetsArrayPtr = pScatterStackOffsets;
+ STORE(vSrc, pSrcArrayPtr);
+ STORE(vOffsets, pOffsetsArrayPtr);
+
+ // Cast to pointers for random access
+ pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
+ pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
+
+ Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+
+ // Get cttz function
+ Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
+
+ // Setup loop basic block
+ BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
+
+ // compute first set bit
+ Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+
+ Value* pIsUndef = ICMP_EQ(pIndex, C(32));
+
+ // Split current block
+ BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+
+ // Remove unconditional jump created by splitBasicBlock
+ pCurBB->getTerminator()->eraseFromParent();
+
+ // Add terminator to end of original block
+ IRB()->SetInsertPoint(pCurBB);
+
+ // Add conditional branch
+ COND_BR(pIsUndef, pPostLoop, pLoop);
+
+ // Add loop basic block contents
+ IRB()->SetInsertPoint(pLoop);
+ PHINode* pIndexPhi = PHI(mInt32Ty, 2);
+ PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+
+ pIndexPhi->addIncoming(pIndex, pCurBB);
+ pMaskPhi->addIncoming(pMask, pCurBB);
+
+ // Extract elements for this index
+ Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
+ Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+
+ // GEP to this offset in dst
+ Value* pCurDst = GEP(pDst, pOffsetElem);
+ pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
+ STORE(pSrcElem, pCurDst);
+
+ // Update the mask
+ Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
+
+ // Terminator
+ Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+
+ pIsUndef = ICMP_EQ(pNewIndex, C(32));
+ COND_BR(pIsUndef, pPostLoop, pLoop);
+
+ // Update phi edges
+ pIndexPhi->addIncoming(pNewIndex, pLoop);
+ pMaskPhi->addIncoming(pNewMask, pLoop);
+
+ // Move builder to beginning of post loop
+ IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief save/restore stack, providing ability to push/pop the stack and
+ /// reduce overall stack requirements for temporary stack use
+ Value* Builder::STACKSAVE()
+ {
+ Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
+ return CALLA(pfnStackSave);
+ }
+
+ void Builder::STACKRESTORE(Value* pSaved)
+ {
+ Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
+ CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
+ }
+
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
new file mode 100644
index 00000000000..6264f398161
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -0,0 +1,73 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.h
+*
+* @brief miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
+Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
+Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+
+LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
+LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
+StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
+StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
+
+Value *MASKLOADD(Value* src, Value* mask);
+
+void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+
+virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, Value *pDrawContext = nullptr);
+Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
+
+void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+
+Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
+Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
+
+void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+ Value* mask, Value* vGatherComponents[], bool bPackedOutput);
+
+Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
+
+void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
+
+void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
+void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
+
+Value* STACKSAVE();
+void STACKRESTORE(Value* pSaved);
+
+// Static stack allocations for scatter operations
+Value* pScatterStackSrc{ nullptr };
+Value* pScatterStackOffsets{ nullptr };
+
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 10a5979d864..0738d023321 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -303,70 +303,6 @@ namespace SwrJit
return pValConst->getSExtValue();
}
- Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(i);
- return GEPA(ptr, indices);
- }
-
- Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(C(i));
- return GEPA(ptr, indices);
- }
-
- Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(i);
- return IN_BOUNDS_GEP(ptr, indices);
- }
-
- Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(C(i));
- return IN_BOUNDS_GEP(ptr, indices);
- }
-
- LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(C(i));
- return LOAD(GEPA(basePtr, valIndices), name);
- }
-
- LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(i);
- return LOAD(GEPA(basePtr, valIndices), name);
- }
-
- StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(C(i));
- return STORE(val, GEPA(basePtr, valIndices));
- }
-
- StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(i);
- return STORE(val, GEPA(basePtr, valIndices));
- }
-
CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
{
std::vector<Value*> args;
@@ -419,34 +355,6 @@ namespace SwrJit
}
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate an i32 masked load operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with float masked load
- /// @param src - base address pointer for the load
- /// @param vMask - SIMD wide mask that controls whether to access memory load 0
- Value *Builder::MASKLOADD(Value* src,Value* mask)
- {
- Value* vResult;
- // use avx2 gather instruction is available
- if(JM()->mArch.AVX2())
- {
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
- vResult = CALL(func,{src,mask});
- }
- else
- {
- // maskload intrinsic expects integer mask operand in llvm >= 3.8
- #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
- mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
- #else
- mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
- #endif
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
- vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
- }
- return vResult;
- }
-
- //////////////////////////////////////////////////////////////////////////
/// @brief insert a JIT call to CallPrint
/// - outputs formatted string to both stdout and VS output window
/// - DEBUG builds only
@@ -581,222 +489,6 @@ namespace SwrJit
return PRINT(printStr, {});
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, Value *pDrawContext)
- {
- Value *vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
- {
- // force mask to <N x float>, required by vgather
- Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
-
- vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
- }
- else
- {
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = VUNDEF_F();
- Value *vScaleVec = VIMMED1((uint32_t)scale);
- Value *vOffsets = MUL(vIndices,vScaleVec);
- for(uint32_t i = 0; i < mVWidth; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets,C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase,offset);
- loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
- Value *selMask = VEXTRACT(vMask,C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress);
- vGather = VINSERT(vGather,val,C(i));
- }
-
- STACKRESTORE(pStack);
- }
-
- return vGather;
- }
-
- Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
- {
- Value *vGather = VUNDEF_F_16();
-
- // use AVX512F gather instruction if available
- if (JM()->mArch.AVX512F())
- {
- // force mask to <N-bit Integer>, required by vgather2
- Value *mask = BITCAST(vMask, mInt16Ty);
-
- vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
- }
- else
- {
- Value *src0 = EXTRACT_16(vSrc, 0);
- Value *src1 = EXTRACT_16(vSrc, 1);
-
- Value *indices0 = EXTRACT_16(vIndices, 0);
- Value *indices1 = EXTRACT_16(vIndices, 1);
-
- Value *mask0 = EXTRACT_16(vMask, 0);
- Value *mask1 = EXTRACT_16(vMask, 1);
-
- Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
- Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
-
- vGather = JOIN_16(gather0, gather1);
- }
-
- return vGather;
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
- {
- Value* vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
- {
- vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
- }
- else
- {
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = VUNDEF_I();
- Value *vScaleVec = VIMMED1((uint32_t)scale);
- Value *vOffsets = MUL(vIndices, vScaleVec);
- for(uint32_t i = 0; i < mVWidth; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets, C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase, offset);
- loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
- Value *selMask = VEXTRACT(vMask, C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress, C(0));
- vGather = VINSERT(vGather, val, C(i));
- }
-
- STACKRESTORE(pStack);
- }
-
- return vGather;
- }
-
- Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
- {
- Value *vGather = VUNDEF_I_16();
-
- // use AVX512F gather instruction if available
- if (JM()->mArch.AVX512F())
- {
- // force mask to <N-bit Integer>, required by vgather2
- Value *mask = BITCAST(vMask, mInt16Ty);
-
- vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
- }
- else
- {
- Value *src0 = EXTRACT_16(vSrc, 0);
- Value *src1 = EXTRACT_16(vSrc, 1);
-
- Value *indices0 = EXTRACT_16(vIndices, 0);
- Value *indices1 = EXTRACT_16(vIndices, 1);
-
- Value *mask0 = EXTRACT_16(vMask, 0);
- Value *mask1 = EXTRACT_16(vMask, 1);
-
- Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
- Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
-
- vGather = JOIN_16(gather0, gather1);
- }
-
- return vGather;
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
- {
- Value* vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
- {
- vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
- vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
- }
- else
- {
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
- Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
- Value *vOffsets = MUL(vIndices,vScaleVec);
- for(uint32_t i = 0; i < mVWidth/2; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets,C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase,offset);
- loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
- Value *selMask = VEXTRACT(vMask,C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress);
- vGather = VINSERT(vGather,val,C(i));
- }
- STACKRESTORE(pStack);
- }
- return vGather;
- }
-
Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
{
if (imm == 0)
@@ -1064,360 +756,6 @@ namespace SwrJit
return SELECT(cmp, a, b);
}
- void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
- {
- const SWR_FORMAT_INFO &info = GetFormatInfo(format);
- if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
- {
- GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
- }
- else
- {
- GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
- }
- }
-
- void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
- {
- switch(info.bpp / info.numComps)
- {
- case 16:
- {
- Value* vGatherResult[2];
-
- // TODO: vGatherMaskedVal
- Value* vGatherMaskedVal = VIMMED1((float)0);
-
- // always have at least one component out of x or y to fetch
-
- vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
-
- // if we have at least one component out of x or y to fetch
- if(info.numComps > 2)
- {
- // offset base to the next components(zw) in the vertex to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
-
- vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = vGatherMaskedVal;
- }
-
- // Shuffle gathered components into place, each row is a component
- Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
- }
- break;
- case 32:
- {
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
-
- // offset base to the next component to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- }
- }
- break;
- default:
- SWR_INVALID("Invalid float format");
- break;
- }
- }
-
- void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
- {
- switch (info.bpp / info.numComps)
- {
- case 8:
- {
- Value* vGatherMaskedVal = VIMMED1((int32_t)0);
- Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
- // e.g. result of an 8x32bit integer gather for 8bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-
- Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
- }
- break;
- case 16:
- {
- Value* vGatherResult[2];
-
- // TODO: vGatherMaskedVal
- Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-
- // always have at least one component out of x or y to fetch
-
- vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
-
- // if we have at least one component out of x or y to fetch
- if(info.numComps > 2)
- {
- // offset base to the next components(zw) in the vertex to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
-
- vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = vGatherMaskedVal;
- }
-
- // Shuffle gathered components into place, each row is a component
- Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-
- }
- break;
- case 32:
- {
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
-
- // offset base to the next component to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- }
- }
- break;
- default:
- SWR_INVALID("unsupported format");
- break;
- }
- }
-
- void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
- {
- // cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
- // input could either be float or int vector; do shuffle work in int
- vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
- vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
-
- if(bPackedOutput)
- {
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-
- // shuffle mask
- Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
- // after pshufb: group components together in each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- // after PERMD: move and pack xy components into each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-
- // do the same for zw components
- Value* vi128ZW = nullptr;
- if(info.numComps > 2)
- {
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- }
-
- for(uint32_t i = 0; i < 4; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
- // todo: fixed for packed
- Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
- if(i >= info.numComps)
- {
- // set the default component val
- vGatherOutput[swizzleIndex] = vGatherMaskedVal;
- continue;
- }
-
- // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
- uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
- // extract packed component 128 bit lanes
- vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
- }
-
- }
- else
- {
- // pshufb masks for each component
- Value* vConstMask[2];
- // x/z shuffle mask
- vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
- 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
-
- // y/w shuffle mask
- vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
- 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-
-
- // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // select correct constMask for x/z or y/w pshufb
- uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- uint32_t selectedGather = (i < 2) ? 0 : 1;
-
- vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
- // after pshufb mask for x channel; z uses the same shuffle from the second gather
- // 256i - 0 1 2 3 4 5 6 7
- // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
- }
- }
- }
-
- void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
- {
- // cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
-
- if(bPackedOutput)
- {
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
- // shuffle mask
- Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
- 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
- // after pshufb: group components together in each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
- // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
-
- // do the same for zw components
- Value* vi128ZW = nullptr;
- if(info.numComps > 2)
- {
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
- }
-
- // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
- for(uint32_t i = 0; i < 4; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
- // todo: fix for packed
- Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
- if(i >= info.numComps)
- {
- // set the default component val
- vGatherOutput[swizzleIndex] = vGatherMaskedVal;
- continue;
- }
-
- // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
- uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
- // sign extend
- vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
- }
- }
- // else zero extend
- else{
- // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++){
- uint32_t swizzleIndex = info.swizzle[i];
-
- // pshufb masks for each component
- Value* vConstMask;
- switch(i)
- {
- case 0:
- // x shuffle mask
- vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
- 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
- break;
- case 1:
- // y shuffle mask
- vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
- 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
- break;
- case 2:
- // z shuffle mask
- vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
- 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
- break;
- case 3:
- // w shuffle mask
- vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
- 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
- break;
- default:
- vConstMask = nullptr;
- break;
- }
-
- vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
- // after pshufb for x channel
- // 256i - 0 1 2 3 4 5 6 7
- // x000 x000 x000 x000 x000 x000 x000 x000
- }
- }
- }
-
// Helper function to create alloca in entry block of function
Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
{
@@ -1439,105 +777,6 @@ namespace SwrJit
return pAlloca;
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief emulates a scatter operation.
- /// @param pDst - pointer to destination
- /// @param vSrc - vector of src data to scatter
- /// @param vOffsets - vector of byte offsets from pDst
- /// @param vMask - mask of valid lanes
- void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
- {
- /* Scatter algorithm
-
- while(Index = BitScanForward(mask))
- srcElem = srcVector[Index]
- offsetElem = offsetVector[Index]
- *(pDst + offsetElem) = srcElem
- Update mask (&= ~(1<<Index)
-
- */
-
- BasicBlock* pCurBB = IRB()->GetInsertBlock();
- Function* pFunc = pCurBB->getParent();
- Type* pSrcTy = vSrc->getType()->getVectorElementType();
-
- // Store vectors on stack
- if (pScatterStackSrc == nullptr)
- {
- // Save off stack allocations and reuse per scatter. Significantly reduces stack
- // requirements for shaders with a lot of scatters.
- pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
- pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
- }
-
- Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
- Value* pOffsetsArrayPtr = pScatterStackOffsets;
- STORE(vSrc, pSrcArrayPtr);
- STORE(vOffsets, pOffsetsArrayPtr);
-
- // Cast to pointers for random access
- pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
- pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
-
- Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
-
- // Get cttz function
- Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
-
- // Setup loop basic block
- BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
-
- // compute first set bit
- Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
-
- Value* pIsUndef = ICMP_EQ(pIndex, C(32));
-
- // Split current block
- BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
-
- // Remove unconditional jump created by splitBasicBlock
- pCurBB->getTerminator()->eraseFromParent();
-
- // Add terminator to end of original block
- IRB()->SetInsertPoint(pCurBB);
-
- // Add conditional branch
- COND_BR(pIsUndef, pPostLoop, pLoop);
-
- // Add loop basic block contents
- IRB()->SetInsertPoint(pLoop);
- PHINode* pIndexPhi = PHI(mInt32Ty, 2);
- PHINode* pMaskPhi = PHI(mInt32Ty, 2);
-
- pIndexPhi->addIncoming(pIndex, pCurBB);
- pMaskPhi->addIncoming(pMask, pCurBB);
-
- // Extract elements for this index
- Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
- Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
-
- // GEP to this offset in dst
- Value* pCurDst = GEP(pDst, pOffsetElem);
- pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
- STORE(pSrcElem, pCurDst);
-
- // Update the mask
- Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
-
- // Terminator
- Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
-
- pIsUndef = ICMP_EQ(pNewIndex, C(32));
- COND_BR(pIsUndef, pPostLoop, pLoop);
-
- // Update phi edges
- pIndexPhi->addIncoming(pNewIndex, pLoop);
- pMaskPhi->addIncoming(pNewMask, pLoop);
-
- // Move builder to beginning of post loop
- IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
- }
-
Value* Builder::VABSPS(Value* a)
{
Value* asInt = BITCAST(a, mSimdInt32Ty);
@@ -1575,21 +814,6 @@ namespace SwrJit
return result;
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief save/restore stack, providing ability to push/pop the stack and
- /// reduce overall stack requirements for temporary stack use
- Value* Builder::STACKSAVE()
- {
- Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
- return CALLA(pfnStackSave);
- }
-
- void Builder::STACKRESTORE(Value* pSaved)
- {
- Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
- CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
- }
-
Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
{
Value* vOut;
@@ -1707,7 +931,6 @@ namespace SwrJit
}
}
-
uint32_t Builder::GetTypeSize(Type* pType)
{
if (pType->isStructTy())
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 891b31d3201..50d7a1e71fa 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -90,22 +90,12 @@ Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
uint32_t IMMED(Value* i);
int32_t S_IMMED(Value* i);
-Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-
CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args, const llvm::Twine& name = "");
CallInst *CALL(Value *Callee) { return CALLA(Callee); }
CallInst *CALL(Value *Callee, Value* arg);
CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
-LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
-LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
-StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
-StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
-
Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
@@ -129,30 +119,6 @@ Value *VMASK_16(Value *mask);
Value *EXTRACT_16(Value *x, uint32_t imm);
Value *JOIN_16(Value *a, Value *b);
-Value *MASKLOADD(Value* src, Value* mask);
-
-void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, Value *pDrawContext = nullptr);
-Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
-
-void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
-Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
-
-void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
-
-void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
-
-void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
-void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
-
Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
@@ -180,8 +146,6 @@ Value *FCLAMP(Value* src, float low, float high);
CallInst *PRINT(const std::string &printStr);
CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
-Value* STACKSAVE();
-void STACKRESTORE(Value* pSaved);
Value* POPCNT(Value* a);
@@ -199,9 +163,4 @@ void RDTSC_STOP(Value* pBucketMgr, Value* pId);
Value* CreateEntryAlloca(Function* pFunc, Type* pType);
Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize);
-// Static stack allocations for scatter operations
-Value* pScatterStackSrc{ nullptr };
-Value* pScatterStackOffsets{ nullptr };
-
-
uint32_t GetTypeSize(Type* pType);