summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp1088
1 files changed, 368 insertions, 720 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index ac09a82f6c2..99a936d1760 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -50,7 +50,6 @@ enum ConversionType
#if USE_SIMD16_SHADERS
#define USE_SIMD16_GATHERS 0
-#define USE_SIMD16_BUILDER 0
#endif
//////////////////////////////////////////////////////////////////////////
@@ -61,6 +60,7 @@ struct FetchJit : public Builder
FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
Function* Create(const FETCH_COMPILE_STATE& fetchState);
+
Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
@@ -69,43 +69,49 @@ struct FetchJit : public Builder
typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
const uint32_t(&)[4]> Shuffle8bpcArgs;
+
#if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+ void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
+#else
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
+#endif
#else
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
#endif
-#if USE_SIMD16_BUILDER
- void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
-#endif
typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
+
#if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+ void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
+#else
void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
+#endif
#else
void Shuffle16bpcGather(Shuffle16bpcArgs &args);
#endif
-#if USE_SIMD16_BUILDER
- void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
-#endif
+#if USE_SIMD16_GATHERS
+ void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#else
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
-#if USE_SIMD16_BUILDER
- void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
#endif
#if USE_SIMD16_SHADERS
- Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
+#if USE_SIMD16_GATHERS
+ Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
#else
- Value* GenerateCompCtrlVector(const ComponentControl ctrl);
+ Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
#endif
-#if USE_SIMD16_BUILDER
- Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
+#else
+ Value *GenerateCompCtrlVector(const ComponentControl ctrl);
#endif
void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
-#if USE_SIMD16_SHADERS
+#if USE_SIMD16_SHADERS
#if USE_SIMD16_GATHERS
void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
#else
@@ -833,24 +839,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
uint32_t outputElt = 0;
Value* vVertexElements[4];
#if USE_SIMD16_GATHERS
- Value* vVertexElements2[4];
-#if USE_SIMD16_BUILDER
Value *pVtxSrc2[4];
#endif
-#endif
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
#else
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
#endif
-#else
- Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
-#endif
curInstance->setName("curInstance");
for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
@@ -874,14 +873,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *vStride16 = VBROADCAST_16(stride);
#else
Value *vStride = VBROADCAST(stride);
#endif
-#else
- Value *vStride = VBROADCAST(stride);
-#endif
// max vertex index that is fully in bounds
Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
@@ -901,23 +896,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
curInstance = ADD(curInstance, startInstance);
}
- Value *vCurIndices;
#if USE_SIMD16_GATHERS
- Value *vCurIndices2;
-#if USE_SIMD16_BUILDER
Value *vCurIndices16;
-#endif
+#else
+ Value *vCurIndices;
#endif
Value *startOffset;
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *vInstanceStride16 = VIMMED1_16(0);
#else
Value *vInstanceStride = VIMMED1(0);
#endif
-#else
- Value *vInstanceStride = VIMMED1(0);
-#endif
if (ied.InstanceEnable)
{
@@ -934,14 +923,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
vCurIndices16 = VBROADCAST_16(calcInstance);
#else
vCurIndices = VBROADCAST(calcInstance);
- vCurIndices2 = VBROADCAST(calcInstance);
-#endif
-#else
- vCurIndices = VBROADCAST(calcInstance);
#endif
startOffset = startInstance;
@@ -951,27 +935,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// grab the instance advancement state, determines stride in bytes from one instance to the next
Value* stepRate = C(ied.InstanceAdvancementState);
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
#else
vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
#endif
-#else
- vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
-#endif
// offset indices by baseVertex
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *vIndices16 = JOIN_16(vIndices, vIndices2);
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
#else
vCurIndices = ADD(vIndices, vBaseVertex);
- vCurIndices2 = ADD(vIndices2, vBaseVertex);
-#endif
-#else
- vCurIndices = ADD(vIndices, vBaseVertex);
#endif
startOffset = startVertex;
@@ -981,16 +956,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
{
// offset indices by baseVertex
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *vIndices16 = JOIN_16(vIndices, vIndices2);
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
#else
vCurIndices = ADD(vIndices, vBaseVertex);
- vCurIndices2 = ADD(vIndices2, vBaseVertex);
-#endif
-#else
- vCurIndices = ADD(vIndices, vBaseVertex);
#endif
startOffset = startVertex;
@@ -1021,7 +991,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
partialInboundsSize = LOAD(partialInboundsSize);
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
Value *vBpp = VBROADCAST_16(C(info.Bpp));
Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
@@ -1030,17 +999,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *vBpp = VBROADCAST(C(info.Bpp));
Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
#endif
-#else
- Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
- Value *vBpp = VBROADCAST(C(info.Bpp));
- Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
-#endif
// is the element is <= the partially valid size
Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
// override cur indices with 0 if pitch is 0
Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
@@ -1091,58 +1054,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// override cur indices with 0 if pitch is 0
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
- vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
-
- // are vertices partially OOB?
- Value* vMaxVertex = VBROADCAST(maxVertex);
- Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
- Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
-
- // are vertices fully in bounds?
- Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
- Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
-
- Value *vGatherMask;
- Value *vGatherMask2;
- if (fetchState.bPartialVertexBuffer)
- {
- // are vertices below minVertex limit?
- Value *vMinVertex = VBROADCAST(minVertex);
- Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
- Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
-
- // only fetch lanes that pass both tests
- vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
- vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
- }
- else
- {
- vGatherMask = vMaxGatherMask;
- vGatherMask2 = vMaxGatherMask2;
- }
-
- // blend in any partially OOB indices that have valid elements
- vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
- vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
-
- // calculate the actual offsets into the VB
- Value* vOffsets = MUL(vCurIndices, vStride);
- vOffsets = ADD(vOffsets, vAlignmentOffsets);
-
- Value* vOffsets2 = MUL(vCurIndices2, vStride);
- vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
-
- // if instance stride enable is:
- // true - add product of the instanceID and advancement state to the offst into the VB
- // false - value of vInstanceStride has been initialialized to zero
- vOffsets = ADD(vOffsets, vInstanceStride);
- vOffsets2 = ADD(vOffsets2, vInstanceStride);
-
-#endif
-#else
- // override cur indices with 0 if pitch is 0
- Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
- vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
// are vertices partially OOB?
Value* vMaxVertex = VBROADCAST(maxVertex);
@@ -1190,7 +1101,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
#if USE_SIMD16_GATHERS
Value *pResults[4];
Value *pResults2[4];
- CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
+ CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
ConvertFormat((SWR_FORMAT)ied.Format, pResults);
ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
@@ -1199,43 +1110,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
{
if (isComponentEnabled(compMask, c))
{
-#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN_16(pResults[c], pResults2[c]);
-
-#else
- vVertexElements[currentVertexElement] = pResults[c];
- vVertexElements2[currentVertexElement] = pResults2[c];
-
-#endif
- currentVertexElement += 1;
+ pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
if (currentVertexElement > 3)
{
-#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
- StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
-
-#else
- StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
- StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
-
-#endif
- outputElt += 1;
-
+ StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
}
}
#else
- Value* pResults[4];
+ Value *pResults[4];
CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
ConvertFormat((SWR_FORMAT)ied.Format, pResults);
- for (uint32_t c = 0; c < 4; ++c)
+ for (uint32_t c = 0; c < 4; c += 1)
{
if (isComponentEnabled(compMask, c))
{
@@ -1255,11 +1149,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
///@todo: support 64 bit vb accesses
Value *gatherSrc = VIMMED1(0.0f);
#if USE_SIMD16_GATHERS
- Value *gatherSrc2 = VIMMED1(0.0f);
-#if USE_SIMD16_BUILDER
Value *gatherSrc16 = VIMMED1_16(0.0f);
#endif
-#endif
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
"Unsupported format for standard gather fetch.");
@@ -1270,7 +1161,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
case 16:
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *gatherResult[2];
// if we have at least one component out of x or y to fetch
@@ -1306,73 +1196,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
gatherResult[1] = VUNDEF_I_16();
}
-#else
- Value *vGatherResult[2];
- Value *vGatherResult2[2];
-
- // if we have at least one component out of x or y to fetch
- if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
- {
- vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
- }
- else
- {
- vGatherResult[0] = VUNDEF_I();
- vGatherResult2[0] = VUNDEF_I();
- }
-
- // if we have at least one component out of z or w to fetch
- if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
- {
- // offset base to the next components(zw) in the vertex to gather
- pStreamBase = GEP(pStreamBase, C((char)4));
-
- vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = VUNDEF_I();
- vGatherResult2[1] = VUNDEF_I();
- }
-
-#endif
// if we have at least one component to shuffle into place
if (compMask)
{
-#if USE_SIMD16_BUILDER
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
// Shuffle gathered components into place in simdvertex struct
- Shuffle16bpcGather2(args); // outputs to vVertexElements ref
-#else
- Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
- Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
-
- // Shuffle gathered components into place in simdvertex struct
- Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
- Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
-#endif
+ Shuffle16bpcGather16(args); // outputs to vVertexElements ref
}
#else
- Value* vGatherResult[2];
+ Value *vGatherResult[2];
// if we have at least one component out of x or y to fetch
- if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+ {
vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
@@ -1381,7 +1221,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
// if we have at least one component out of z or w to fetch
- if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+ {
// offset base to the next components(zw) in the vertex to gather
pStreamBase = GEP(pStreamBase, C((char)4));
@@ -1393,7 +1234,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
// if we have at least one component to shuffle into place
- if(compMask){
+ if (compMask)
+ {
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
@@ -1422,59 +1264,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
-#if USE_SIMD16_BUILDER
- Value *shiftedOffsets = LSHR(vOffsets16, 1);
- pVtxSrc2[currentVertexElement] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2);
-
-#else
- Value *vShiftedOffsets = LSHR(vOffsets, 1);
- Value *vShiftedOffsets2 = LSHR(vOffsets2, 1);
-
- vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
- vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
-
-#if USE_SIMD16_BUILDER
- // pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement],
- vVertexElements2[currentVertexElement]);
-
-#endif
-#endif
- currentVertexElement += 1;
+ Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
+ pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
}
else
{
-#if USE_SIMD16_BUILDER
- pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
-#else
- vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
- vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
-
-#if USE_SIMD16_BUILDER
- // pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement],
- vVertexElements2[currentVertexElement]);
-
-#endif
-#endif
- currentVertexElement += 1;
+ pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
-#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
- StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
-
-#else
- StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
- StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
-
-#endif
- outputElt += 1;
-
+ StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -1493,7 +1296,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
- Value* vShiftedOffsets = LSHR(vOffsets, 1);
+ Value *vShiftedOffsets = LSHR(vOffsets, 1);
vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
}
else
@@ -1554,45 +1357,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
-#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN_16(pGather, pGather2);
-
-#else
- vVertexElements[currentVertexElement] = pGather;
- vVertexElements2[currentVertexElement] = pGather2;
-
-#endif
- currentVertexElement += 1;
+ pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
}
else
{
-#if USE_SIMD16_BUILDER
- pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
-
-#else
- vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
- vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
-
-#endif
- currentVertexElement += 1;
+ pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
-#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
- StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
-
-#else
- StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
- StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
-
-#endif
- outputElt += 1;
-
+ StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -1614,10 +1392,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
- Value* pGatherLo = GATHERPD(vZeroDouble,
- pStreamBase, vOffsetsLo, vMaskLo);
- Value* pGatherHi = GATHERPD(vZeroDouble,
- pStreamBase, vOffsetsHi, vMaskHi);
+ Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
+ Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
pGatherLo = VCVTPD2PS(pGatherLo);
pGatherHi = VCVTPD2PS(pGatherHi);
@@ -1693,11 +1469,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// value substituted when component of gather is masked
Value* gatherSrc = VIMMED1(0);
#if USE_SIMD16_GATHERS
- Value* gatherSrc2 = VIMMED1(0);
-#if USE_SIMD16_BUILDER
Value *gatherSrc16 = VIMMED1_16(0);
#endif
-#endif
// Gather components from memory to store in a simdvertex structure
switch (bpc)
@@ -1708,42 +1481,21 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (compMask)
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-#else
- Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
-
- // e.g. result of an 8x32bit integer gather for 8bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-
-#endif
-#if USE_SIMD16_BUILDER
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
// Shuffle gathered components into place in simdvertex struct
- Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
-#else
- Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
- Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
-
- // Shuffle gathered components into place in simdvertex struct
- Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
- Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
-#endif
+ Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref
#else
- Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+ Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
@@ -1764,8 +1516,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
case 16:
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
- Value* gatherResult[2];
+ Value *gatherResult[2];
// if we have at least one component out of x or y to fetch
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
@@ -1800,73 +1551,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
gatherResult[1] = VUNDEF_I_16();
}
-#else
- Value* vGatherResult[2];
- Value* vGatherResult2[2];
-
- // if we have at least one component out of x or y to fetch
- if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
- {
- vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
- }
- else
- {
- vGatherResult[0] = VUNDEF_I();
- vGatherResult2[0] = VUNDEF_I();
- }
-
- // if we have at least one component out of z or w to fetch
- if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
- {
- // offset base to the next components(zw) in the vertex to gather
- pStreamBase = GEP(pStreamBase, C((char)4));
-
- vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = VUNDEF_I();
- vGatherResult2[1] = VUNDEF_I();
- }
-
-#endif
// if we have at least one component to shuffle into place
if (compMask)
{
-#if USE_SIMD16_BUILDER
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
// Shuffle gathered components into place in simdvertex struct
- Shuffle16bpcGather2(args); // outputs to vVertexElements ref
-#else
- Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
- Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
- currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
-
- // Shuffle gathered components into place in simdvertex struct
- Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
- Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
-#endif
+ Shuffle16bpcGather16(args); // outputs to vVertexElements ref
}
#else
- Value* vGatherResult[2];
+ Value *vGatherResult[2];
// if we have at least one component out of x or y to fetch
- if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+ {
vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
@@ -1875,7 +1576,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
// if we have at least one component out of z or w to fetch
- if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+ {
// offset base to the next components(zw) in the vertex to gather
pStreamBase = GEP(pStreamBase, C((char)4));
@@ -1887,7 +1589,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
// if we have at least one component to shuffle into place
- if(compMask){
+ if (compMask)
+ {
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
@@ -1912,7 +1615,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (compCtrl[i] == StoreSrc)
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
if (conversionType == CONVERT_USCALED)
@@ -1928,41 +1630,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
}
-#else
- Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
- Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
-
- if (conversionType == CONVERT_USCALED)
- {
- pGather = UI_TO_FP(pGather, mSimdFP32Ty);
- pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
- }
- else if (conversionType == CONVERT_SSCALED)
- {
- pGather = SI_TO_FP(pGather, mSimdFP32Ty);
- pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
- }
- else if (conversionType == CONVERT_SFIXED)
- {
- pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
- pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
- }
-
-#endif
-#if USE_SIMD16_BUILDER
- pVtxSrc2[currentVertexElement] = pGather;
-
-#else
- vVertexElements[currentVertexElement] = pGather;
- vVertexElements2[currentVertexElement] = pGather2;
-
-#endif
+ pVtxSrc2[currentVertexElement++] = pGather;
// e.g. result of a single 8x32bit integer gather for 32bit components
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
-
- currentVertexElement += 1;
#else
Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
@@ -1980,6 +1652,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
vVertexElements[currentVertexElement++] = pGather;
+
// e.g. result of a single 8x32bit integer gather for 32bit components
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
@@ -1987,40 +1660,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
}
else
{
-#if USE_SIMD16_SHADERS
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
- pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
-
-#else
- vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
- vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
-
-#endif
- currentVertexElement += 1;
+ pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
#else
+#if USE_SIMD16_SHADERS
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
-#endif
#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
#endif
+#endif
}
if (currentVertexElement > 3)
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
- StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
-
-#else
- StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
- StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
-
-#endif
- outputElt += 1;
+ StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
#endif
@@ -2044,18 +1701,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (currentVertexElement > 0)
{
#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
- StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
-
-#else
- StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
- StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
-
-#endif
- outputElt += 1;
+ StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
#endif
@@ -2183,109 +1832,8 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
/// @param swizzle[4] - component swizzle location
-#if USE_SIMD16_SHADERS
-void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
-#else
-void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
-#endif
-{
- // Unpack tuple args
- Value*& vGatherResult = std::get<0>(args);
- Value* pVtxOut = std::get<1>(args);
- const Instruction::CastOps extendType = std::get<2>(args);
- const ConversionType conversionType = std::get<3>(args);
- uint32_t &currentVertexElement = std::get<4>(args);
- uint32_t &outputElt = std::get<5>(args);
- const ComponentEnable compMask = std::get<6>(args);
- const ComponentControl (&compCtrl)[4] = std::get<7>(args);
- Value* (&vVertexElements)[4] = std::get<8>(args);
- const uint32_t (&swizzle)[4] = std::get<9>(args);
-
- // cast types
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
-
- for (uint32_t i = 0; i < 4; i++)
- {
- if (!isComponentEnabled(compMask, i))
- continue;
-
- if (compCtrl[i] == ComponentControl::StoreSrc)
- {
- std::vector<uint32_t> vShuffleMasks[4] = {
- { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
- { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
- { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
- { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
- };
-
- Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
- UndefValue::get(v32x8Ty),
- vShuffleMasks[swizzle[i]]);
-
- if ((extendType == Instruction::CastOps::SExt) ||
- (extendType == Instruction::CastOps::SIToFP)) {
- switch (conversionType)
- {
- case CONVERT_NORMALIZED:
- val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
- break;
- case CONVERT_SSCALED:
- val = SI_TO_FP(val, mSimdFP32Ty);
- break;
- case CONVERT_USCALED:
- SWR_INVALID("Type should not be sign extended!");
- break;
- default:
- SWR_ASSERT(conversionType == CONVERT_NONE);
- val = S_EXT(val, mSimdInt32Ty);
- break;
- }
- } else if ((extendType == Instruction::CastOps::ZExt) ||
- (extendType == Instruction::CastOps::UIToFP)) {
- switch (conversionType)
- {
- case CONVERT_NORMALIZED:
- val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
- break;
- case CONVERT_SSCALED:
- SWR_INVALID("Type should not be zero extended!");
- break;
- case CONVERT_USCALED:
- val = UI_TO_FP(val, mSimdFP32Ty);
- break;
- default:
- SWR_ASSERT(conversionType == CONVERT_NONE);
- val = Z_EXT(val, mSimdInt32Ty);
- break;
- }
- }
- else
- {
- SWR_INVALID("Unsupported conversion type");
- }
-
- vVertexElements[currentVertexElement++] = val;
- }
- else
- {
-#if USE_SIMD16_SHADERS
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
-#else
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-#endif
- }
-
- if (currentVertexElement > 3)
- {
- StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
- // reset to the next vVertexElement to output
- currentVertexElement = 0;
- }
- }
-}
-
-#if USE_SIMD16_BUILDER
-void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
+#if USE_SIMD16_GATHERS
+void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
{
// Unpack tuple args
Value*& vGatherResult = std::get<0>(args);
@@ -2408,12 +1956,12 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
}
else
{
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
- StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -2506,12 +2054,12 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
}
else
{
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
- StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -2524,6 +2072,109 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
}
}
+#else
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) // useVertexID2 is only forwarded to GenerateCompCtrlVector below
+#else
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+#endif
+{
+ // Unpack tuple args into named locals (currentVertexElement, outputElt and vVertexElements are references held by the tuple, so updates here persist)
+ Value*& vGatherResult = std::get<0>(args);
+ Value* pVtxOut = std::get<1>(args);
+ const Instruction::CastOps extendType = std::get<2>(args);
+ const ConversionType conversionType = std::get<3>(args);
+ uint32_t &currentVertexElement = std::get<4>(args);
+ uint32_t &outputElt = std::get<5>(args);
+ const ComponentEnable compMask = std::get<6>(args);
+ const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+ Value* (&vVertexElements)[4] = std::get<8>(args);
+ const uint32_t(&swizzle)[4] = std::get<9>(args);
+
+ // cast types: byte-wide vector type the gather result is bitcast to before shuffling
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ for (uint32_t i = 0; i < 4; i++) // one iteration per component (x, y, z, w)
+ {
+ if (!isComponentEnabled(compMask, i)) // components not enabled in compMask produce no output
+ continue;
+
+ if (compCtrl[i] == ComponentControl::StoreSrc) // component is taken from the gathered data
+ {
+ std::vector<uint32_t> vShuffleMasks[4] = { // byte indices: one byte out of every group of four
+ { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
+ { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
+ { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
+ { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
+ };
+
+ Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
+ UndefValue::get(v32x8Ty),
+ vShuffleMasks[swizzle[i]]); // swizzle[i] selects which byte of each group feeds component i
+
+ if ((extendType == Instruction::CastOps::SExt) ||
+ (extendType == Instruction::CastOps::SIToFP)) { // signed source data
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED: // convert to float, then scale by 1/127
+ val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
+ break;
+ case CONVERT_SSCALED: // convert to float without scaling
+ val = SI_TO_FP(val, mSimdFP32Ty);
+ break;
+ case CONVERT_USCALED: // unsigned-scaled is rejected on the signed path
+ SWR_INVALID("Type should not be sign extended!");
+ break;
+ default: // no conversion: sign-extend to mSimdInt32Ty
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ val = S_EXT(val, mSimdInt32Ty);
+ break;
+ }
+ }
+ else if ((extendType == Instruction::CastOps::ZExt) ||
+ (extendType == Instruction::CastOps::UIToFP)) { // unsigned source data
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED: // convert to float, then scale by 1/255
+ val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
+ break;
+ case CONVERT_SSCALED: // signed-scaled is rejected on the unsigned path
+ SWR_INVALID("Type should not be zero extended!");
+ break;
+ case CONVERT_USCALED: // convert to float without scaling
+ val = UI_TO_FP(val, mSimdFP32Ty);
+ break;
+ default: // no conversion: zero-extend to mSimdInt32Ty
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ val = Z_EXT(val, mSimdInt32Ty);
+ break;
+ }
+ }
+ else
+ {
+ SWR_INVALID("Unsupported conversion type");
+ }
+
+ vVertexElements[currentVertexElement++] = val;
+ }
+ else // not sourced from the gather: use the value GenerateCompCtrlVector builds for this control
+ {
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
+ }
+
+ if (currentVertexElement > 3) // four components collected: write them out and advance outputElt
+ {
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+}
+
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
@@ -2539,11 +2190,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
/// @param compMask - component packing mask
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
-#if USE_SIMD16_SHADERS
-void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
-#else
-void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
-#endif
+#if USE_SIMD16_GATHERS
+void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
{
// Unpack tuple args
Value* (&vGatherResult)[2] = std::get<0>(args);
@@ -2557,45 +2205,63 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
Value* (&vVertexElements)[4] = std::get<8>(args);
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+ Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// have to do extra work for sign extending
- if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
- (extendType == Instruction::CastOps::FPExt))
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
{
// is this PP float?
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
- Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+ Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+ Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
- Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
- Value* vi128XY = nullptr;
- if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+ Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+ Value *vi128XY_lo = nullptr;
+ Value *vi128XY_hi = nullptr;
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+ {
+ // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
+
+ Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+ vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
}
// do the same for zw components
- Value* vi128ZW = nullptr;
- if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+ Value *vi128ZW_lo = nullptr;
+ Value *vi128ZW_hi = nullptr;
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+ {
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
+
+ Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+ vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed
Instruction::CastOps IntToFpCast;
- Value* conversionFactor;
+ Value *conversionFactor;
switch (conversionType)
{
@@ -2627,35 +2293,43 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+ Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+ Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
- if (bFP) {
+ if (bFP)
+ {
// extract 128 bit lanes to sign extend each component
- vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+ Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+ Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
}
- else {
+ else
+ {
// extract 128 bit lanes to sign extend each component
- vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+ Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+ Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
// denormalize if needed
- if (conversionType != CONVERT_NONE) {
- vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ if (conversionType != CONVERT_NONE)
+ {
+ temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
+
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
}
- currentVertexElement++;
+
+ currentVertexElement += 1;
}
else
{
-#if USE_SIMD16_SHADERS
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
-#else
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-#endif
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
- StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -2666,17 +2340,20 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
{
// pshufb masks for each component
- Value* vConstMask[2];
- if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
+ Value *vConstMask[2];
+
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
+ {
// x/z shuffle mask
- vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
- 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+ vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
}
-
- if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
+
+ if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
+ {
// y/w shuffle mask
- vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
- 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+ vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
}
// init denormalize variables if needed
@@ -2715,7 +2392,14 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
// if x or y, use vi128XY permute result, else use vi128ZW
uint32_t selectedGather = (i < 2) ? 0 : 1;
- vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
+
+ Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+
// after pshufb mask for x channel; z uses the same shuffle from the second gather
// 256i - 0 1 2 3 4 5 6 7
// xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
@@ -2723,22 +2407,22 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
// denormalize if needed
if (conversionType != CONVERT_NONE)
{
- vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
- currentVertexElement++;
+
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
+
+ currentVertexElement += 1;
}
else
{
-#if USE_SIMD16_SHADERS
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
-#else
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-#endif
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
}
if (currentVertexElement > 3)
{
- StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -2751,8 +2435,12 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
}
}
-#if USE_SIMD16_BUILDER
-void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
+#else
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
+#else
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+#endif
{
// Unpack tuple args
Value* (&vGatherResult)[2] = std::get<0>(args);
@@ -2766,71 +2454,45 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
Value* (&vVertexElements)[4] = std::get<8>(args);
// cast types
- Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
- // have to do extra work for sign extending
- if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
+ // have to do extra work for sign extending
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
+ (extendType == Instruction::CastOps::FPExt))
{
// is this PP float?
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
- Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
- Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-
- // shuffle mask
- Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
- Value *vi128XY = nullptr;
- Value *vi128XY_lo = nullptr;
- Value *vi128XY_hi = nullptr;
- if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
- {
- // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
-
- Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
- Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
-
- Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
- Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+ Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+ // shuffle mask
+ Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+ Value* vi128XY = nullptr;
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-
+ vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-#if 0
- vi128XY = JOIN_16(vi128XY_lo, vi128XY_hi);
-#endif
}
// do the same for zw components
- Value *vi128ZW = nullptr;
- Value *vi128ZW_lo = nullptr;
- Value *vi128ZW_hi = nullptr;
- if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
- {
- Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
- Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
-
- Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
- Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
-
- vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-#if 0
- vi128ZW = JOIN_16(vi128ZW_lo, vi128ZW_hi);
-#endif
+ Value* vi128ZW = nullptr;
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed
Instruction::CastOps IntToFpCast;
- Value *conversionFactor;
+ Value* conversionFactor;
switch (conversionType)
{
@@ -2862,43 +2524,35 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
- Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
- Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
- if (bFP)
- {
+ if (bFP) {
// extract 128 bit lanes to sign extend each component
- Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
- Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
-
- vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
}
- else
- {
+ else {
// extract 128 bit lanes to sign extend each component
- Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
- Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+ vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
// denormalize if needed
- if (conversionType != CONVERT_NONE)
- {
- temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
- temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ if (conversionType != CONVERT_NONE) {
+ vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
}
-
- vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
}
-
- currentVertexElement += 1;
+ currentVertexElement++;
}
else
{
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
{
- StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -2909,17 +2563,14 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
{
// pshufb masks for each component
- Value *vConstMask[2];
-
- if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
- {
+ Value* vConstMask[2];
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
// x/z shuffle mask
vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
}
- if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
- {
+ if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
// y/w shuffle mask
vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
@@ -2961,14 +2612,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
// if x or y, use vi128XY permute result, else use vi128ZW
uint32_t selectedGather = (i < 2) ? 0 : 1;
- // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
-
- Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
- Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
-
- Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
- Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
-
+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
// after pshufb mask for x channel; z uses the same shuffle from the second gather
// 256i - 0 1 2 3 4 5 6 7
// xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
@@ -2976,22 +2620,22 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
// denormalize if needed
if (conversionType != CONVERT_NONE)
{
- temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
- temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
}
-
- vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
-
- currentVertexElement += 1;
+ currentVertexElement++;
}
else
{
- vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
{
- StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
// reset to the next vVertexElement to output
currentVertexElement = 0;
}
@@ -3011,39 +2655,36 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
-void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+#if USE_SIMD16_GATHERS
+void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
{
SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
- for(uint32_t c = 0; c < numEltsToStore; ++c)
+ for (uint32_t c = 0; c < numEltsToStore; ++c)
{
// STORE expects FP32 x vWidth type, just bitcast if needed
- if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
+ if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
{
#if FETCH_DUMP_VERTEX
- PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
+ PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
#endif
- vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
}
#if FETCH_DUMP_VERTEX
else
{
- PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
+ PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
}
#endif
// outputElt * 4 = offsetting by the size of a simdvertex
// + c offsets to a 32bit x vWidth row within the current vertex
-#if USE_SIMD16_SHADERS
- Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
-#else
Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
-#endif
STORE(vVertexElements[c], dest);
}
}
-#if USE_SIMD16_BUILDER
-void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+#else
+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
{
SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
@@ -3055,7 +2696,7 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co
#if FETCH_DUMP_VERTEX
PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
#endif
- vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
}
#if FETCH_DUMP_VERTEX
else
@@ -3065,7 +2706,11 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co
#endif
// outputElt * 4 = offsetting by the size of a simdvertex
// + c offsets to a 32bit x vWidth row within the current vertex
+#if USE_SIMD16_SHADERS
+ Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
+#else
Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+#endif
STORE(vVertexElements[c], dest);
}
}
@@ -3075,21 +2720,56 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
+#if USE_SIMD16_GATHERS
+Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl) // SIMD16 variant (built only under USE_SIMD16_GATHERS): maps a ComponentControl value to the vector emitted for that component
+{
+ switch (ctrl)
+ {
+ case NoStore:
+ return VUNDEF_I_16(); // NOTE(review): presumably an undefined 16-wide integer vector - confirm against the Builder helpers
+ case Store0:
+ return VIMMED1_16(0); // integer literal 0
+ case Store1Fp:
+ return VIMMED1_16(1.0f); // float literal 1.0f (contrast with the integer 1 used for Store1Int)
+ case Store1Int:
+ return VIMMED1_16(1); // integer literal 1
+ case StoreVertexId:
+ { // the vertex id is read from two separate fetch-context fields and joined into one value
+ Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); // first half: the VertexID field, reinterpreted as mSimdFP32Ty
+ Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); // second half: the VertexID2 field, same reinterpretation
+
+ Value *pId = JOIN_16(pId_lo, pId_hi); // argument order is (lo, hi)
+
+ return pId;
+ }
+ case StoreInstanceId:
+ { // a single CurInstance value is loaded, then broadcast
+ Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); // bitcast to the scalar mFP32Ty, not a SIMD type
+ return VBROADCAST_16(pId);
+ }
+ case StoreSrc: // StoreSrc deliberately shares the default path: it is reported as invalid here
+ default:
+ SWR_INVALID("Invalid component control");
+ return VUNDEF_I_16(); // still returns a value after reporting the invalid control
+ }
+}
+
+#else
#if USE_SIMD16_SHADERS
-Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
+Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
#else
-Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
#endif
{
- switch(ctrl)
+ switch (ctrl)
{
- case NoStore:
+ case NoStore:
return VUNDEF_I();
- case Store0:
+ case Store0:
return VIMMED1(0);
- case Store1Fp:
+ case Store1Fp:
return VIMMED1(1.0f);
- case Store1Int:
+ case Store1Int:
return VIMMED1(1);
case StoreVertexId:
{
@@ -3114,41 +2794,9 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
return VBROADCAST(pId);
}
case StoreSrc:
- default:
- SWR_INVALID("Invalid component control"); return VUNDEF_I();
- }
-}
-
-#if USE_SIMD16_BUILDER
-Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
-{
- switch (ctrl)
- {
- case NoStore:
- return VUNDEF_I_16();
- case Store0:
- return VIMMED1_16(0);
- case Store1Fp:
- return VIMMED1_16(1.0f);
- case Store1Int:
- return VIMMED1_16(1);
- case StoreVertexId:
- {
- Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
- Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
-
- Value *pId = JOIN_16(pId_lo, pId_hi);
-
- return pId;
- }
- case StoreInstanceId:
- {
- Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
- return VBROADCAST_16(pId);
- }
- case StoreSrc:
- default:
- SWR_INVALID("Invalid component control"); return VUNDEF_I_16();
+ default:
+ SWR_INVALID("Invalid component control");
+ return VUNDEF_I();
}
}