aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2016-02-16 17:28:09 -0600
committerTim Rowley <[email protected]>2016-03-02 18:38:41 -0600
commitc6e67f5a9373e916a8d2333585cb5787aa5f7bb7 (patch)
tree5b5c60bea784f16736c394c989fdd5df3ebae233 /src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
parent2b2d3680bf164ec4f8b50436b96c3fc195318ea5 (diff)
gallium/swr: add OpenSWR rasterizer
Acked-by: Roland Scheidegger <[email protected]> Acked-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp')
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp357
1 files changed, 357 insertions, 0 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
new file mode 100644
index 00000000000..6c5f22bc47c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -0,0 +1,357 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.cpp
+*
+* @brief Implementation of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "streamout_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+
+#include <sstream>
+#include <unordered_set>
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to Jitting a streamout shader
+//////////////////////////////////////////////////////////////////////////
+struct StreamOutJit : public Builder
+{
+    StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr) {}
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief loads one entry of the streamout context's pBuffer array
+    // @param pSoCtx - pointer to the SWR_STREAMOUT_CONTEXT
+    // @param buffer - index of the SO buffer to fetch
+    // @return pointer to SWR_STREAMOUT_BUFFER
+    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
+    {
+        return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief checks if streamout buffer is oob
+    // @param state - compile state; only numVertsPerPrim is read here
+    // @param pSoCtx - pointer to the SWR_STREAMOUT_CONTEXT
+    // @param buffer - index of the SO buffer to test
+    // @return <i1> true/false
+    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
+    {
+        Value* returnMask = C(false);
+
+        Value* pBuf = getSOBuffer(pSoCtx, buffer);
+
+        // load enable
+        // @todo bool data types should generate <i1> llvm type
+        Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
+
+        // load buffer size
+        Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
+
+        // load current streamOffset
+        Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+
+        // load buffer pitch
+        Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+
+        // buffer is considered oob if in use in a decl but not enabled
+        returnMask = OR(returnMask, NOT(enabled));
+
+        // buffer is oob if cannot fit a prims worth of verts, i.e.
+        // streamOffset + pitch * numVertsPerPrim > bufferSize (signed compare)
+        Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
+        returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
+
+        return returnMask;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
+    //        packing the active mask bits
+    //        ex. bitmask 0011 -> (0, 1, 0, 0)
+    //            bitmask 1000 -> (3, 0, 0, 0)
+    //            bitmask 1100 -> (2, 3, 0, 0)
+    // @param bitmask - component mask; only the low 4 bits are used
+    // @return constant vector holding the indices of the set bits, lowest
+    //         first, with the unused trailing lanes left at 0
+    Value* PackMask(uint32_t bitmask)
+    {
+        // Only four lanes exist. Dropping any higher bits keeps the loop below
+        // from writing past the end of 'indices' and keeps every index in 0..3.
+        bitmask &= 0xF;
+
+        std::vector<Constant*> indices(4, C(0));
+        DWORD index;
+        uint32_t elem = 0;
+        while (_BitScanForward(&index, bitmask))
+        {
+            indices[elem++] = C((int)index);
+            // unsigned literal so that clearing the bit never shifts a signed 1
+            bitmask &= ~(1u << index);
+        }
+
+        return ConstantVector::get(indices);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief convert scalar bitmask to <4xfloat> bitmask
+    // @param bitmask - one bit per lane; only the low 4 bits are examined
+    // @return constant vector with -1.0f in lanes whose bit is set and 0.0f
+    //         in the others
+    Value* ToMask(uint32_t bitmask)
+    {
+        std::vector<Constant*> indices;
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            if (bitmask & (1u << i))
+            {
+                // -1.0f has its sign bit set
+                // NOTE(review): presumably chosen because the AVX maskstore
+                // selects lanes by the mask element's top bit - confirm
+                indices.push_back(C(-1.0f));
+            }
+            else
+            {
+                indices.push_back(C(0.0f));
+            }
+        }
+        return ConstantVector::get(indices);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief processes a single decl from the streamout stream. Reads 4 components from the input
+    //        stream and writes N components to the output buffer given the componentMask or if
+    //        a hole, just increments the buffer pointer
+    // @param pStream - pointer to current attribute
+    // @param pOutBuffers - pointers to the current location of each output buffer;
+    //                      the entry for decl.bufferIndex is advanced by the
+    //                      number of components in decl.componentMask
+    // @param decl - input decl
+    void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
+    {
+        uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
+        if (!decl.hole)
+        {
+            // The intrinsic and the store mask are only needed when something is
+            // actually written, so they are set up inside this branch.
+            // @todo add this to x86 macros
+            Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
+
+            // after packing, the enabled components occupy the low numComponents lanes
+            uint32_t packedMask = (1 << numComponents) - 1;
+
+            // increment stream pointer to correct slot
+            Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
+
+            // load 4 components from stream
+            Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
+            Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
+            pAttrib = BITCAST(pAttrib, simd4PtrTy);
+            Value *vattrib = LOAD(pAttrib);
+
+            // shuffle/pack enabled components
+            Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
+
+            // store to output buffer
+            // cast SO buffer to i8*, needed by maskstore
+            Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
+
+            // cast input to <4xfloat>
+            Value* src = BITCAST(vpackedAttrib, simd4Ty);
+            CALL(maskStore, {pOut, ToMask(packedMask), src});
+        }
+
+        // increment SO buffer
+        pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief builds a single vertex worth of data for the given stream
+    // @param streamState - state for this stream
+    // @param pCurVertex - pointer to src stream vertex data
+    // @param pOutBuffer - pointers to up to 4 SO buffers
+    void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
+    {
+        for (uint32_t d = 0; d < streamState.numDecls; ++d)
+        {
+            const STREAMOUT_DECL& decl = streamState.decl[d];
+            buildDecl(pCurVertex, pOutBuffer, decl);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief emits the IR that streams one primitive out. numPrimStorageNeeded
+    //        is always incremented. If any buffer referenced by the decls is
+    //        oob the code branches to returnBB. Otherwise, in a new "valid"
+    //        block, numPrimsWritten is incremented, every vertex of the prim
+    //        is written and each referenced buffer's streamOffset is advanced.
+    //        On return the insert point is still inside the "valid" block,
+    //        which has no terminator yet.
+    // @param state - compile state; numVertsPerPrim is read here
+    // @param streamState - decls describing what to write and where
+    // @param pSoCtx - pointer to the SWR_STREAMOUT_CONTEXT
+    // @param returnBB - block to branch to when the primitive must be skipped
+    // @param soFunc - function that receives the new "valid" block
+    void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
+    {
+        // get list of active SO buffers
+        std::unordered_set<uint32_t> activeSOBuffers;
+        for (uint32_t d = 0; d < streamState.numDecls; ++d)
+        {
+            const STREAMOUT_DECL& decl = streamState.decl[d];
+            activeSOBuffers.insert(decl.bufferIndex);
+        }
+
+        // always increment numPrimStorageNeeded
+        Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
+        numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
+        STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
+
+        // check OOB on active SO buffers. If any buffer is out of bound, don't write
+        // the primitive to any buffer
+        Value* oobMask = C(false);
+        for (uint32_t buffer : activeSOBuffers)
+        {
+            oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
+        }
+
+        BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
+
+        // early out if OOB
+        COND_BR(oobMask, returnBB, validBB);
+
+        IRB()->SetInsertPoint(validBB);
+
+        Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
+        numPrimsWritten = ADD(numPrimsWritten, C(1));
+        STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
+
+        // compute start pointer for each output buffer
+        // (only the entries of active buffers are filled in; the rest stay null)
+        Value* pOutBuffer[4] = {};
+        Value* pOutBufferStartVertex[4] = {};
+        Value* outBufferPitch[4] = {};
+        for (uint32_t b: activeSOBuffers)
+        {
+            Value* pBuf = getSOBuffer(pSoCtx, b);
+            Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
+            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+            pOutBuffer[b] = GEP(pData, streamOffset);
+            pOutBufferStartVertex[b] = pOutBuffer[b];
+
+            outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+        }
+
+        // loop over the vertices of the prim
+        Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
+        for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
+        {
+            buildVertex(streamState, pStreamData, pOutBuffer);
+
+            // increment stream and output buffer pointers
+            // stream verts are KNOB_NUM_ATTRIBUTES * 4 elements apart
+            pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
+
+            // output buffers offset using pitch in buffer state; buildVertex moved
+            // pOutBuffer past the decls, so restart it from the next vertex start
+            for (uint32_t b : activeSOBuffers)
+            {
+                pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
+                pOutBuffer[b] = pOutBufferStartVertex[b];
+            }
+        }
+
+        // update each active buffer's streamOffset
+        for (uint32_t b : activeSOBuffers)
+        {
+            Value* pBuf = getSOBuffer(pSoCtx, b);
+            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+            streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
+            STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief creates the LLVM function for the given streamout state in the
+    //        JitManager's current module, dumps it, runs a fixed set of
+    //        function passes over it and dumps it again
+    // @param state - SO state to build the function from
+    // @return the generated function, named "SOShader" followed by a counter
+    Function* Create(const STREAMOUT_COMPILE_STATE& state)
+    {
+        // function-local counter so every generated function gets its own name
+        static std::size_t soNum = 0;
+
+        // 'ate' makes the stream append after the initial "SOShader" text
+        std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+        fnName << soNum++;
+
+        // SO function signature
+        // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
+
+        std::vector<Type*> args{
+            PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
+        };
+
+        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
+        Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+
+        // create entry and return basic blocks
+        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
+        BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
+
+        IRB()->SetInsertPoint(entry);
+
+        // arguments (the function takes exactly one)
+        auto argitr = soFunc->getArgumentList().begin();
+        Value* pSoCtx = &*argitr;
+        pSoCtx->setName("pSoCtx");
+
+        const STREAMOUT_STREAM& streamState = state.stream;
+        buildStream(state, streamState, pSoCtx, returnBB, soFunc);
+
+        // buildStream leaves the insert point in its unterminated "valid" block
+        BR(returnBB);
+
+        IRB()->SetInsertPoint(returnBB);
+        RET_VOID();
+
+        JitManager::DumpToFile(soFunc, "SoFunc");
+
+        FunctionPassManager passes(JM()->mpCurrentModule);
+        passes.add(createBreakCriticalEdgesPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createPromoteMemoryToRegisterPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createInstructionCombiningPass());
+        passes.add(createInstructionSimplifierPass());
+        passes.add(createConstantPropagationPass());
+        passes.add(createSCCPPass());
+        passes.add(createAggressiveDCEPass());
+
+        passes.run(*soFunc);
+
+        JitManager::DumpToFile(soFunc, "SoFunc_optimized");
+
+        return soFunc;
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Resolves the native entry point of a streamout function that
+///        has already been built as LLVM IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - handle to the LLVM function IR
+/// @return PFN_SO_FUNC - pointer to SOS function
+PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
+{
+    JitManager* const pManager = reinterpret_cast<JitManager*>(hJitMgr);
+    const llvm::Function* const pIrFunc = reinterpret_cast<const llvm::Function*>(hFunc);
+
+    // the execution engine is queried by function name
+    const auto entryAddress = pManager->mpExec->getFunctionAddress(pIrFunc->getName().str());
+
+    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    pManager->mIsModuleFinalized = true;
+
+    return reinterpret_cast<PFN_SO_FUNC>(entryAddress);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JIT compiles streamout shader
+/// @param hJitMgr - JitManager handle
+/// @param state - SO state to build function from; it is copied, and the
+///                copy's decl attribSlots are lowered by offsetAttribs
+///                before the function is built
+/// @return PFN_SO_FUNC - pointer to the compiled streamout function
+extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
+{
+    // work on a private copy so the caller's state is left untouched
+    STREAMOUT_COMPILE_STATE rebasedState = state;
+
+    const auto slotBias = rebasedState.offsetAttribs;
+    if (slotBias)
+    {
+        auto& stream = rebasedState.stream;
+        for (uint32_t declIdx = 0; declIdx < stream.numDecls; ++declIdx)
+        {
+            stream.decl[declIdx].attribSlot -= slotBias;
+        }
+    }
+
+    JitManager* pManager = reinterpret_cast<JitManager*>(hJitMgr);
+    pManager->SetupNewModule();
+
+    // build the IR, then hand it over for native code lookup
+    StreamOutJit jitter(pManager);
+    HANDLE hIrFunc = jitter.Create(rebasedState);
+
+    return JitStreamoutFunc(hJitMgr, hIrFunc);
+}