diff options
author | Tim Rowley <[email protected]> | 2017-12-19 13:39:09 -0600 |
---|---|---|
committer | Tim Rowley <[email protected]> | 2018-01-10 09:44:07 -0600 |
commit | e14b48e00e56b59de4bb916be994756295d7b685 (patch) | |
tree | 782a182fc1a1001a3098e1e33e8b0b794ca7743e | |
parent | 336afe7d7a8e066e1286bb93791d5c3d96ccc317 (diff) |
swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)
Reviewed-by: Bruce Cherniak <[email protected]>
5 files changed, 239 insertions, 233 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 4b83a3204cf..c46159a35ac 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -40,52 +40,56 @@ namespace SwrJit Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { + SWR_ASSERT(pJitMgr->mVWidth == 8); + mVWidth = pJitMgr->mVWidth; -#if USE_SIMD16_BUILDER - mVWidth2 = pJitMgr->mVWidth * 2; -#endif + mVWidth16 = pJitMgr->mVWidth * 2; mpIRBuilder = &pJitMgr->mBuilder; - mVoidTy = Type::getVoidTy(pJitMgr->mContext); - mFP16Ty = Type::getHalfTy(pJitMgr->mContext); - mFP32Ty = Type::getFloatTy(pJitMgr->mContext); - mFP32PtrTy = PointerType::get(mFP32Ty, 0); - mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); - mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); - mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); - mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); - mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); - mInt8PtrTy = PointerType::get(mInt8Ty, 0); + // Built in types: scalar + + mVoidTy = Type::getVoidTy(pJitMgr->mContext); + mFP16Ty = Type::getHalfTy(pJitMgr->mContext); + mFP32Ty = Type::getFloatTy(pJitMgr->mContext); + mFP32PtrTy = PointerType::get(mFP32Ty, 0); + mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); + mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); + mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); + mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); + mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); mInt16PtrTy = PointerType::get(mInt16Ty, 0); mInt32PtrTy = PointerType::get(mInt32Ty, 0); - mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); - mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); - mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); - mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); + mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); + + // Built in types: simd8 + + mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); -#if USE_SIMD16_BUILDER - mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2); - mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2); - mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2); - mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2); - mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2); - mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2); - mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4); - mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5); -#endif + + // Built in types: simd16 + + mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16); + mSimd16Int16Ty = VectorType::get(mInt16Ty, mVWidth16); + mSimd16Int32Ty = VectorType::get(mInt32Ty, mVWidth16); + mSimd16Int64Ty = VectorType::get(mInt64Ty, mVWidth16); + mSimd16FP16Ty = VectorType::get(mFP16Ty, mVWidth16); + mSimd16FP32Ty = VectorType::get(mFP32Ty, mVWidth16); + mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4); + mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5); if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; mSimdIntPtrTy = mSimdInt32Ty; -#if USE_SIMD16_BUILDER - mSimd2IntPtrTy = mSimd2Int32Ty; -#endif + mSimd16IntPtrTy = mSimd16Int32Ty; } else { @@ -93,9 +97,7 @@ namespace SwrJit mIntPtrTy = mInt64Ty; mSimdIntPtrTy = mSimdInt64Ty; -#if USE_SIMD16_BUILDER - mSimd2IntPtrTy = mSimd2Int64Ty; -#endif + mSimd16IntPtrTy = mSimd16Int64Ty; } } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index c6ab64e06e8..288c5d9ecd4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -32,26 +32,23 @@ #include "JitManager.h" #include "common/formats.h" -#define USE_SIMD16_BUILDER 0 - namespace SwrJit { using namespace llvm; struct Builder { Builder(JitManager *pJitMgr); - IRBuilder<>* IRB() { return mpIRBuilder; }; - JitManager* JM() { return mpJitMgr; } + IRBuilder<> *IRB() { return mpIRBuilder; }; + JitManager *JM() { return mpJitMgr; } + + JitManager *mpJitMgr; + IRBuilder<> *mpIRBuilder; - JitManager* mpJitMgr; - IRBuilder<>* mpIRBuilder; + uint32_t mVWidth; // vector width simd8 + uint32_t mVWidth16; // vector width simd16 - uint32_t mVWidth; -#if USE_SIMD16_BUILDER - uint32_t mVWidth2; -#endif + // Built in types: scalar - // Built in types. Type* mVoidTy; Type* mInt1Ty; Type* mInt8Ty; @@ -66,6 +63,9 @@ namespace SwrJit Type* mInt8PtrTy; Type* mInt16PtrTy; Type* mInt32PtrTy; + + // Built in types: simd8 + Type* mSimdFP16Ty; Type* mSimdFP32Ty; Type* mSimdInt1Ty; @@ -75,17 +75,18 @@ namespace SwrJit Type* mSimdIntPtrTy; Type* mSimdVectorTy; Type* mSimdVectorTRTy; -#if USE_SIMD16_BUILDER - Type* mSimd2FP16Ty; - Type* mSimd2FP32Ty; - Type* mSimd2Int1Ty; - Type* mSimd2Int16Ty; - Type* mSimd2Int32Ty; - Type* mSimd2Int64Ty; - Type* mSimd2IntPtrTy; - Type* mSimd2VectorTy; - Type* mSimd2VectorTRTy; -#endif + + // Built in types: simd16 + + Type* mSimd16FP16Ty; + Type* mSimd16FP32Ty; + Type* mSimd16Int1Ty; + Type* mSimd16Int16Ty; + Type* mSimd16Int32Ty; + Type* mSimd16Int64Ty; + Type* mSimd16IntPtrTy; + Type* mSimd16VectorTy; + Type* mSimd16VectorTRTy; #include "gen_builder.hpp" #include "gen_builder_x86.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 0774889af10..92b07a5bfc0 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -196,56 +196,59 @@ namespace SwrJit return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1(uint32_t i) + Value *Builder::VIMMED1_16(int i) { - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1(float i) + Value *Builder::VIMMED1(uint32_t i) { - return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED1(bool i) + Value *Builder::VIMMED1_16(uint32_t i) { - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } -#if USE_SIMD16_BUILDER - Value *Builder::VIMMED2_1(int i) + Value *Builder::VIMMED1(float i) { - return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } - Value *Builder::VIMMED2_1(uint32_t i) + Value *Builder::VIMMED1_16(float i) { - return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i))); } - Value *Builder::VIMMED2_1(float i) + Value *Builder::VIMMED1(bool i) { - return ConstantVector::getSplat(mVWidth2, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } - Value *Builder::VIMMED2_1(bool i) + Value *Builder::VIMMED1_16(bool i) { - return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); } -#endif Value *Builder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); } + Value *Builder::VUNDEF(Type* t) + { + return UndefValue::get(VectorType::get(t, mVWidth)); + } + Value *Builder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } - Value *Builder::VUNDEF(Type *ty, uint32_t size) + Value *Builder::VUNDEF_I_16() { - return UndefValue::get(VectorType::get(ty, size)); + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); } Value *Builder::VUNDEF_F() @@ -253,21 +256,14 @@ namespace SwrJit return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } -#if USE_SIMD16_BUILDER - Value *Builder::VUNDEF2_F() - { - return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2)); - } - - Value *Builder::VUNDEF2_I() + Value *Builder::VUNDEF_F_16() { - return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2)); + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); } -#endif - Value *Builder::VUNDEF(Type* t) + Value *Builder::VUNDEF(Type *ty, uint32_t size) { - return UndefValue::get(VectorType::get(t, mVWidth)); + return UndefValue::get(VectorType::get(ty, size)); } Value *Builder::VBROADCAST(Value *src) @@ -281,8 +277,7 @@ namespace SwrJit return VECTOR_SPLAT(mVWidth, src); } -#if USE_SIMD16_BUILDER - Value *Builder::VBROADCAST2(Value *src) + Value *Builder::VBROADCAST_16(Value *src) { // check if src is already a vector if (src->getType()->isVectorTy()) @@ -290,10 +285,9 @@ namespace SwrJit return src; } - return VECTOR_SPLAT(mVWidth2, src); + return VECTOR_SPLAT(mVWidth16, src); } -#endif uint32_t Builder::IMMED(Value* v) { SWR_ASSERT(isa<ConstantInt>(v)); @@ -632,18 +626,18 @@ namespace SwrJit Value *val = LOAD(validAddress); vGather = VINSERT(vGather,val,C(i)); } + STACKRESTORE(pStack); } return vGather; } -#if USE_SIMD16_BUILDER Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) { - Value *vGather = VUNDEF2_F(); + Value *vGather = VUNDEF_F_16(); - // use avx512 gather instruction if available + // use AVX512F gather instruction if available if (JM()->mArch.AVX512F()) { // force mask to <N-bit Integer>, required by vgather2 @@ -653,25 +647,24 @@ namespace SwrJit } else { - Value *src0 = EXTRACT2(vSrc, 0); - Value *src1 = EXTRACT2(vSrc, 1); + Value *src0 = EXTRACT_16(vSrc, 0); + Value *src1 = EXTRACT_16(vSrc, 1); - Value *indices0 = EXTRACT2(vIndices, 0); - Value *indices1 = EXTRACT2(vIndices, 1); + Value *indices0 = EXTRACT_16(vIndices, 0); + Value *indices1 = EXTRACT_16(vIndices, 1); - Value *mask0 = EXTRACT2(vMask, 0); - Value *mask1 = EXTRACT2(vMask, 1); + Value *mask0 = EXTRACT_16(vMask, 0); + Value *mask1 = EXTRACT_16(vMask, 1); Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); - vGather = JOIN2(gather0, gather1); + vGather = JOIN_16(gather0, gather1); } return vGather; } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads @@ -718,15 +711,15 @@ namespace SwrJit STACKRESTORE(pStack); } + return vGather; } -#if USE_SIMD16_BUILDER Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) { - Value *vGather = VUNDEF2_F(); + Value *vGather = VUNDEF_I_16(); - // use avx512 gather instruction if available + // use AVX512F gather instruction if available if (JM()->mArch.AVX512F()) { // force mask to <N-bit Integer>, required by vgather2 @@ -736,25 +729,24 @@ namespace SwrJit } else { - Value *src0 = EXTRACT2(vSrc, 0); - Value *src1 = EXTRACT2(vSrc, 1); + Value *src0 = EXTRACT_16(vSrc, 0); + Value *src1 = EXTRACT_16(vSrc, 1); - Value *indices0 = EXTRACT2(vIndices, 0); - Value *indices1 = EXTRACT2(vIndices, 1); + Value *indices0 = EXTRACT_16(vIndices, 0); + Value *indices1 = EXTRACT_16(vIndices, 1); - Value *mask0 = EXTRACT2(vMask, 0); - Value *mask1 = EXTRACT2(vMask, 1); + Value *mask0 = EXTRACT_16(vMask, 0); + Value *mask1 = EXTRACT_16(vMask, 1); Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); - vGather = JOIN2(gather0, gather1); + vGather = JOIN_16(gather0, gather1); } return vGather; } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads @@ -804,21 +796,22 @@ namespace SwrJit return vGather; } -#if USE_SIMD16_BUILDER - Value *Builder::EXTRACT2(Value *x, uint32_t imm) + Value *Builder::EXTRACT_16(Value *x, uint32_t imm) { if (imm == 0) - return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); + { + return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 }); + } else - return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); + { + return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 }); + } } - Value *Builder::JOIN2(Value *a, Value *b) + Value *Builder::JOIN_16(Value *a, Value *b) { - return VSHUFFLE(a, b, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief convert x86 <N x float> mask to llvm <N x i1> mask @@ -828,14 +821,12 @@ namespace SwrJit return ICMP_SLT(src, VIMMED1(0)); } -#if USE_SIMD16_BUILDER - Value *Builder::MASK2(Value *vmask) + Value *Builder::MASK_16(Value *vmask) { - Value *src = BITCAST(vmask, mSimd2Int32Ty); - return ICMP_SLT(src, VIMMED2_1(0)); + Value *src = BITCAST(vmask, mSimd16Int32Ty); + return ICMP_SLT(src, VIMMED1_16(0)); } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask Value *Builder::VMASK(Value *mask) @@ -843,13 +834,11 @@ namespace SwrJit return S_EXT(mask, mSimdInt32Ty); } -#if USE_SIMD16_BUILDER - Value *Builder::VMASK2(Value *mask) + Value *Builder::VMASK_16(Value *mask) { - return S_EXT(mask, mSimd2Int32Ty); + return S_EXT(mask, mSimd16Int32Ty); } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generate a VPSHUFB operation in LLVM IR. If not /// supported on the underlying platform, emulate it diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 646ed0efb2e..16ff693c6e4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -50,29 +50,34 @@ Constant *C(const std::initializer_list<Ty> &constList) } Constant *PRED(bool pred); + Value *VIMMED1(int i); +Value *VIMMED1_16(int i); + Value *VIMMED1(uint32_t i); +Value *VIMMED1_16(uint32_t i); + Value *VIMMED1(float i); +Value *VIMMED1_16(float i); + Value *VIMMED1(bool i); -#if USE_SIMD16_BUILDER -Value *VIMMED2_1(int i); -Value *VIMMED2_1(uint32_t i); -Value *VIMMED2_1(float i); -Value *VIMMED2_1(bool i); -#endif +Value *VIMMED1_16(bool i); + Value *VUNDEF(Type* t); + Value *VUNDEF_F(); +Value *VUNDEF_F_16(); + Value *VUNDEF_I(); -#if USE_SIMD16_BUILDER -Value *VUNDEF2_F(); -Value *VUNDEF2_I(); -#endif +Value *VUNDEF_I_16(); + Value *VUNDEF(Type* ty, uint32_t size); + Value *VUNDEF_IPTR(); + Value *VBROADCAST(Value *src); -#if USE_SIMD16_BUILDER -Value *VBROADCAST2(Value *src); -#endif +Value *VBROADCAST_16(Value *src); + Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); @@ -105,21 +110,18 @@ Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); } Value *MASK(Value *vmask); +Value *MASK_16(Value *vmask); + Value *VMASK(Value *mask); -#if USE_SIMD16_BUILDER -Value *MASK2(Value *vmask); -Value *VMASK2(Value *mask); -#endif +Value *VMASK_16(Value *mask); ////////////////////////////////////////////////////////////////////////// /// @brief functions that build IR to call x86 intrinsics directly, or /// emulate them with other instructions if not available on the host ////////////////////////////////////////////////////////////////////////// -#if USE_SIMD16_BUILDER -Value *EXTRACT2(Value *x, uint32_t imm); -Value *JOIN2(Value *a, Value *b); -#endif +Value *EXTRACT_16(Value *x, uint32_t imm); +Value *JOIN_16(Value *a, Value *b); Value *MASKLOADD(Value* src, Value* mask); @@ -127,16 +129,14 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); -#if USE_SIMD16_BUILDER Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); -#endif + void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); -#if USE_SIMD16_BUILDER Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); -#endif + void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index aa911b58f3f..c7605046c35 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -49,6 +49,11 @@ enum ConversionType CONVERT_SFIXED, }; +#if USE_SIMD16_SHADERS +#define USE_SIMD16_GATHERS 0 +#define USE_SIMD16_BUILDER 0 +#endif + ////////////////////////////////////////////////////////////////////////// /// Interface to Jitting a fetch shader ////////////////////////////////////////////////////////////////////////// @@ -101,7 +106,6 @@ struct FetchJit : public Builder void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #if USE_SIMD16_SHADERS -#define USE_SIMD16_GATHERS 0 #if USE_SIMD16_GATHERS void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); @@ -150,7 +154,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) pVtxOut = GEP(pVtxOut, C(0)); #if USE_SIMD16_SHADERS #if 0// USE_SIMD16_BUILDER - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); #else pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); #endif @@ -841,7 +845,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value* vBaseVertex16 = VBROADCAST2(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); + Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #else Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #endif @@ -872,7 +876,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value *vStride16 = VBROADCAST2(stride); + Value *vStride16 = VBROADCAST_16(stride); #else Value *vStride = VBROADCAST(stride); #endif @@ -908,7 +912,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *startOffset; #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value *vInstanceStride16 = VIMMED2_1(0); + Value *vInstanceStride16 = VIMMED1_16(0); #else Value *vInstanceStride = VIMMED1(0); #endif @@ -932,7 +936,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - vCurIndices16 = VBROADCAST2(calcInstance); + vCurIndices16 = VBROADCAST_16(calcInstance); #else vCurIndices = VBROADCAST(calcInstance); vCurIndices2 = VBROADCAST(calcInstance); @@ -949,7 +953,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* stepRate = C(ied.InstanceAdvancementState); #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - vInstanceStride16 = VBROADCAST2(MUL(curInstance, stepRate)); + vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate)); #else vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); #endif @@ -960,7 +964,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value *vIndices16 = JOIN2(vIndices, vIndices2); + Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); #else @@ -979,7 +983,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value *vIndices16 = JOIN2(vIndices, vIndices2); + Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); #else @@ -1019,9 +1023,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, partialInboundsSize = LOAD(partialInboundsSize); #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER - Value *vPartialVertexSize = VBROADCAST2(partialInboundsSize); - Value *vBpp = VBROADCAST2(C(info.Bpp)); - Value *vAlignmentOffsets = VBROADCAST2(C(ied.AlignedByteOffset)); + Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize); + Value *vBpp = VBROADCAST_16(C(info.Bpp)); + Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset)); #else Value *vPartialVertexSize = VBROADCAST(partialInboundsSize); Value *vBpp = VBROADCAST(C(info.Bpp)); @@ -1039,11 +1043,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER // override cur indices with 0 if pitch is 0 - Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED2_1(0)); - vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED2_1(0), vCurIndices16); + Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0)); + vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16); // are vertices partially OOB? - Value *vMaxVertex16 = VBROADCAST2(maxVertex); + Value *vMaxVertex16 = VBROADCAST_16(maxVertex); Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16); // are vertices fully in bounds? @@ -1054,7 +1058,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (fetchState.bPartialVertexBuffer) { // are vertices below minVertex limit? - Value *vMinVertex16 = VBROADCAST2(minVertex); + Value *vMinVertex16 = VBROADCAST_16(minVertex); Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16); // only fetch lanes that pass both tests @@ -1079,11 +1083,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16.. - Value *vGatherMask = EXTRACT2(vGatherMask16, 0); - Value *vGatherMask2 = EXTRACT2(vGatherMask16, 1); + Value *vGatherMask = EXTRACT_16(vGatherMask16, 0); + Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1); - Value *vOffsets = EXTRACT2(vOffsets16, 0); - Value *vOffsets2 = EXTRACT2(vOffsets16, 1); + Value *vOffsets = EXTRACT_16(vOffsets16, 0); + Value *vOffsets2 = EXTRACT_16(vOffsets16, 1); #else // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); @@ -1198,7 +1202,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(pResults[c], pResults2[c]); + pVtxSrc2[currentVertexElement] = JOIN_16(pResults[c], pResults2[c]); #else vVertexElements[currentVertexElement] = pResults[c]; @@ -1211,7 +1215,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); @@ -1254,7 +1258,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS Value *gatherSrc2 = VIMMED1(0.0f); #if USE_SIMD16_BUILDER - Value *gatherSrc16 = VIMMED2_1(0.0f); + Value *gatherSrc16 = VIMMED1_16(0.0f); #endif #endif @@ -1282,7 +1286,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[0] = VUNDEF2_I(); + gatherResult[0] = VUNDEF_I_16(); } // if we have at least one component out of z or w to fetch @@ -1300,7 +1304,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[1] = VUNDEF2_I(); + gatherResult[1] = VUNDEF_I_16(); } #else @@ -1347,7 +1351,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); @@ -1432,7 +1436,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement], + pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement], vVertexElements2[currentVertexElement]); #endif @@ -1449,7 +1453,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement], + pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement], vVertexElements2[currentVertexElement]); #endif @@ -1461,7 +1465,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); @@ -1553,7 +1557,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(pGather, pGather2); + pVtxSrc2[currentVertexElement] = JOIN_16(pGather, pGather2); #else vVertexElements[currentVertexElement] = pGather; @@ -1579,7 +1583,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); @@ -1692,7 +1696,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS Value* gatherSrc2 = VIMMED1(0); #if USE_SIMD16_BUILDER - Value *gatherSrc16 = VIMMED2_1(0); + Value *gatherSrc16 = VIMMED1_16(0); #endif #endif @@ -1722,7 +1726,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #endif #if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle); @@ -1776,7 +1780,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[0] = VUNDEF2_I(); + gatherResult[0] = VUNDEF_I_16(); } // if we have at least one component out of z or w to fetch @@ -1794,7 +1798,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[1] = VUNDEF2_I(); + gatherResult[1] = VUNDEF_I_16(); } #else @@ -1841,7 +1845,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); @@ -1914,15 +1918,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (conversionType == CONVERT_USCALED) { - pGather = UI_TO_FP(pGather, mSimd2FP32Ty); + pGather = UI_TO_FP(pGather, mSimd16FP32Ty); } else if (conversionType == CONVERT_SSCALED) { - pGather = SI_TO_FP(pGather, mSimd2FP32Ty); + pGather = SI_TO_FP(pGather, mSimd16FP32Ty); } else if (conversionType == CONVERT_SFIXED) { - pGather = FMUL(SI_TO_FP(pGather, mSimd2FP32Ty), VBROADCAST2(C(1 / 65536.0f))); + pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f))); } #else @@ -2008,7 +2012,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); @@ -2043,7 +2047,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2); @@ -2320,8 +2324,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - Value *vGatherResult_lo = EXTRACT2(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT2(vGatherResult, 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2399,7 +2403,7 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); currentVertexElement += 1; } @@ -2480,8 +2484,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) break; } - Value *vGatherResult_lo = EXTRACT2(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT2(vGatherResult, 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2497,7 +2501,7 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); currentVertexElement += 1; } @@ -2785,8 +2789,8 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) { // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - Value *vGatherResult_lo = EXTRACT2(vGatherResult[0], 0); - Value *vGatherResult_hi = EXTRACT2(vGatherResult[0], 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1); Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2802,7 +2806,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy #if 0 - vi128XY = JOIN2(vi128XY_lo, vi128XY_hi); + vi128XY = JOIN_16(vi128XY_lo, vi128XY_hi); #endif } @@ -2812,8 +2816,8 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) Value *vi128ZW_hi = nullptr; if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { - Value *vGatherResult_lo = EXTRACT2(vGatherResult[1], 0); - Value *vGatherResult_hi = EXTRACT2(vGatherResult[1], 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1); Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2821,7 +2825,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); #if 0 - vi128ZW = JOIN2(vi128ZW_lo, vi128ZW_hi); + vi128ZW = JOIN_16(vi128ZW_lo, vi128ZW_hi); #endif } @@ -2868,7 +2872,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } else { @@ -2883,7 +2887,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } currentVertexElement += 1; @@ -2960,8 +2964,8 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - Value *vGatherResult_lo = EXTRACT2(vGatherResult[selectedGather], 0); - Value *vGatherResult_hi = EXTRACT2(vGatherResult[selectedGather], 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); @@ -2977,7 +2981,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); currentVertexElement += 1; } @@ -3052,7 +3056,7 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co #if FETCH_DUMP_VERTEX PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); #endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty); + vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty); } #if FETCH_DUMP_VERTEX else @@ -3080,14 +3084,18 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) { switch(ctrl) { - case NoStore: return VUNDEF_I(); - case Store0: return VIMMED1(0); - case Store1Fp: return VIMMED1(1.0f); - case Store1Int: return VIMMED1(1); + case NoStore: + return VUNDEF_I(); + case Store0: + return VIMMED1(0); + case Store1Fp: + return VIMMED1(1.0f); + case Store1Int: + return VIMMED1(1); case StoreVertexId: { #if USE_SIMD16_SHADERS - Value* pId; + Value *pId; if (useVertexID2) { pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); @@ -3097,17 +3105,18 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); } #else - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); #endif return pId; } case StoreInstanceId: { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); return VBROADCAST(pId); } case StoreSrc: - default: SWR_INVALID("Invalid component control"); return VUNDEF_I(); + default: + SWR_INVALID("Invalid component control"); return VUNDEF_I(); } } @@ -3116,26 +3125,31 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) { switch (ctrl) { - case NoStore: return VUNDEF2_I(); - case Store0: return VIMMED2_1(0); - case Store1Fp: return VIMMED2_1(1.0f); - case Store1Int: return VIMMED2_1(1); + case NoStore: + return VUNDEF_I_16(); + case Store0: + return VIMMED1_16(0); + case Store1Fp: + return VIMMED1_16(1.0f); + case Store1Int: + return VIMMED1_16(1); case StoreVertexId: { - Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); - Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); + Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); - Value *pId = JOIN2(pId_lo, pId_hi); + Value *pId = JOIN_16(pId_lo, pId_hi); return pId; } case StoreInstanceId: { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); - return VBROADCAST2(pId); + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + return VBROADCAST_16(pId); } case StoreSrc: - default: SWR_INVALID("Invalid component control"); return VUNDEF2_I(); + default: + SWR_INVALID("Invalid component control"); return VUNDEF_I_16(); } } |