diff options
16 files changed, 565 insertions, 46 deletions
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index 32dd9e59ffd..c22f09ec39c 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -80,7 +80,7 @@ BUILT_SOURCES = \ rasterizer/codegen/gen_knobs.h \ rasterizer/jitter/gen_state_llvm.h \ rasterizer/jitter/gen_builder.hpp \ - rasterizer/jitter/gen_builder_x86.hpp \ + rasterizer/jitter/gen_builder_meta.hpp \ rasterizer/jitter/gen_builder_intrin.hpp \ rasterizer/archrast/gen_ar_event.hpp \ rasterizer/archrast/gen_ar_event.cpp \ @@ -134,12 +134,12 @@ rasterizer/jitter/gen_builder.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rast --output rasterizer/jitter \ --gen_h -rasterizer/jitter/gen_builder_x86.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py +rasterizer/jitter/gen_builder_meta.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py $(MKDIR_GEN) $(PYTHON_GEN) \ $(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \ --output rasterizer/jitter \ - --gen_x86_h + --gen_meta_h rasterizer/jitter/gen_builder_intrin.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py $(MKDIR_GEN) diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index 4924da1f778..a7fcba84042 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -152,7 +152,8 @@ JITTER_CXX_SOURCES := \ rasterizer/jitter/JitManager.h \ rasterizer/jitter/streamout_jit.cpp \ rasterizer/jitter/streamout_jit.h \ - rasterizer/jitter/shader_lib/DebugOutput.cpp + rasterizer/jitter/shader_lib/DebugOutput.cpp \ + rasterizer/jitter/functionpasses/lower_x86.cpp MEMORY_CXX_SOURCES := \ rasterizer/memory/ClearTile.cpp \ diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript 
index 5097be67bb4..528cfac39f6 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -76,10 +76,10 @@ Depends('rasterizer/jitter/gen_builder.hpp', swrroot + 'rasterizer/codegen/templates/gen_builder.hpp') env.CodeGenerate( - target = 'rasterizer/jitter/gen_builder_x86.hpp', + target = 'rasterizer/jitter/gen_builder_meta.hpp', script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py', source = '', - command = python_cmd + ' $SCRIPT --output ' + bldroot + '/rasterizer/jitter --gen_x86_h' + command = python_cmd + ' $SCRIPT --output ' + bldroot + '/rasterizer/jitter --gen_meta_h' ) Depends('rasterizer/jitter/gen_builder.hpp', swrroot + 'rasterizer/codegen/templates/gen_builder.hpp') diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build index 6c0f7ae7a51..7703a6c4831 100644 --- a/src/gallium/drivers/swr/meson.build +++ b/src/gallium/drivers/swr/meson.build @@ -80,6 +80,7 @@ files_swr_mesa = files( 'rasterizer/jitter/streamout_jit.cpp', 'rasterizer/jitter/streamout_jit.h', 'rasterizer/jitter/shader_lib/DebugOutput.cpp', + 'rasterizer/jitter/functionpasses/lower_x86.cpp', ) files_swr_arch = files( @@ -301,7 +302,7 @@ endif libmesaswr = static_library( 'mesaswr', [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp, - gen_builder_hpp, gen_builder_x86_hpp, gen_builder_intrin_hpp], + gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp], cpp_args : [cpp_vis_args, swr_cpp_args, swr_avx_args, swr_arch_defines], include_directories : [inc_common, swr_incs], dependencies : dep_llvm, diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 324f24a3557..bdd785a155d 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -42,28 +42,28 @@ inst_aliases = { } intrinsics = [ - ['VGATHERPD', 
'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], - ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']], - ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']], - ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']], - ['VPERMD', 'x86_avx2_permd', ['a', 'idx']], - ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']], - ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']], - ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']], - ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']], - ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']], - ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']], - ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']], - ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']], - ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']], - ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']], - ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b']], - ['RDTSC', 'x86_rdtsc', []], + ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'], + ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'], + ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'], + ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'], + ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'], + ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'], + ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'], + ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 
'mInt32Ty'], + ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'], + ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'], + ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'], + ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'], + ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'], + ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'], + ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'], + ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'], + ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'], + ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'], + ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'], + ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'], + ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'], + ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'], ] llvm_intrinsics = [ @@ -223,8 +223,8 @@ def generate_gen_h(functions, output_dir): ''' Auto-generates macros for LLVM IR ''' -def generate_x86_h(output_dir): - filename = 'gen_builder_x86.hpp' +def generate_meta_h(output_dir): + filename = 'gen_builder_meta.hpp' output_filename = os.path.join(output_dir, filename) functions = [] @@ -238,15 +238,17 @@ def generate_x86_h(output_dir): functions.append({ 'decl' : decl, + 'name' : inst[0], 'intrin' : inst[1], 'args' : inst[2], + 'returnType': inst[3] }) MakoTemplateWriter.to_file( template, output_filename, cmdline=sys.argv, - comment='x86 intrinsics', + comment='meta intrinsics', filename=filename, functions=functions, isX86=True, isIntrin=False) @@ -291,7 +293,7 @@ def main(): parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False) parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True) parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False) - parser.add_argument('--gen_x86_h', help='Generate x86 
intrinsics. No input is needed.', action='store_true', default=False) + parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False) parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False) args = parser.parse_args() @@ -307,8 +309,8 @@ def main(): elif args.gen_h: print('Need to specify --input for --gen_h!') - if args.gen_x86_h: - generate_x86_h(args.output) + if args.gen_meta_h: + generate_meta_h(args.output) if args.gen_intrin_h: generate_intrin_h(args.output) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/meson.build b/src/gallium/drivers/swr/rasterizer/codegen/meson.build index bbe6efff01a..841540e0f30 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/meson.build +++ b/src/gallium/drivers/swr/rasterizer/codegen/meson.build @@ -44,7 +44,7 @@ gen_knobs_h = custom_target( # The generators above this are needed individually, while the below generators # are all inputs to the same lib, so they don't need unique names. 
files_swr_common += [ - gen_builder_hpp, gen_builder_x86_hpp, gen_knobs_h, gen_knobs_cpp + gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp ] foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'], diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp index 5a47c9aa105..bcbcb30cc14 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp @@ -40,7 +40,16 @@ ${func['decl']} { %if isX86: - Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); + %if len(func['args']) != 0: + SmallVector<Type*, ${len(func['args'])}> argTypes; + %for arg in func['args']: + argTypes.push_back(${arg}->getType()); + %endfor + FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false); + %else: + FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false); + %endif: + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy)); return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); %elif isIntrin: %if len(func['types']) != 0: diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 912a88fd00d..58fdb7fb171 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -32,6 +32,7 @@ #include "jit_api.h" #include "blend_jit.h" #include "gen_state_llvm.h" +#include "functionpasses/passes.h" // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized #define QUANTIZE_THRESHOLD 2 @@ -820,6 +821,8 @@ struct BlendJit : public Builder passes.add(createSCCPPass()); passes.add(createAggressiveDCEPass()); + passes.add(createLowerX86Pass(JM(), this)); + 
passes.run(*blendFunc); JitManager::DumpToFile(blendFunc, "optimized"); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 9f9438de1d8..260daab8621 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -65,6 +65,8 @@ namespace SwrJit mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); + mSimd4FP64Ty = VectorType::get(mDoubleTy, 4); + // Built in types: simd8 mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); @@ -87,6 +89,8 @@ namespace SwrJit mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4); mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5); + mSimd32Int8Ty = VectorType::get(mInt8Ty, 32); + if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 516e872eb0e..0b57fbf16d4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -66,6 +66,8 @@ namespace SwrJit Type* mInt16PtrTy; Type* mInt32PtrTy; + Type* mSimd4FP64Ty; + // Built in types: simd8 Type* mSimdFP16Ty; @@ -90,8 +92,10 @@ namespace SwrJit Type* mSimd16VectorTy; Type* mSimd16VectorTRTy; + Type* mSimd32Int8Ty; + #include "gen_builder.hpp" -#include "gen_builder_x86.hpp" +#include "gen_builder_meta.hpp" #include "gen_builder_intrin.hpp" #include "builder_misc.h" #include "builder_math.h" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index dee08b81693..68695c46c81 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -159,10 +159,7 @@ namespace SwrJit // use avx2 gather instruction if available if (JM()->mArch.AVX2()) { - // force mask to <N x float>, required by 
vgather - Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); - - vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale)); + vGather = VGATHERPS(vSrc, pBasePtr, vIndices, vMask, C(scale)); } else { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 5971a52db7e..f9293aa3b4b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -32,6 +32,7 @@ #include "jit_api.h" #include "fetch_jit.h" #include "gen_state_llvm.h" +#include "functionpasses/passes.h" //#define FETCH_DUMP_VERTEX 1 using namespace llvm; @@ -356,6 +357,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) optPasses.add(createAggressiveDCEPass()); optPasses.run(*fetch); + + optPasses.add(createLowerX86Pass(JM(), this)); optPasses.run(*fetch); JitManager::DumpToFile(fetch, "opt"); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp new file mode 100644 index 00000000000..11a2397c43d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp @@ -0,0 +1,455 @@ +/**************************************************************************** +* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file lower_x86.cpp +* +* @brief llvm pass to lower meta code to x86 +* +* Notes: +* +******************************************************************************/ + +#include "jit_pch.hpp" +#include "passes.h" +#include "JitManager.h" + +#include <unordered_map> + + +namespace llvm +{ + // forward declare the initializer + void initializeLowerX86Pass(PassRegistry&); +} + +namespace SwrJit +{ + using namespace llvm; + + enum TargetArch + { + AVX = 0, + AVX2 = 1, + AVX512 = 2 + }; + + enum TargetWidth + { + W256 = 0, + W512 = 1, + NUM_WIDTHS = 2 + }; + + struct LowerX86; + + typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc; + + struct X86Intrinsic + { + Intrinsic::ID intrin[NUM_WIDTHS]; + EmuFunc emuFunc; + }; + + // Map of intrinsics that haven't been moved to the new mechanism yet. 
If used, these get the previous behavior of + // mapping directly to avx/avx2 intrinsics. + static std::map<std::string, Intrinsic::ID> intrinsicMap = { + {"meta.intrinsic.VGATHERPD", Intrinsic::x86_avx2_gather_d_pd_256}, + {"meta.intrinsic.VROUND", Intrinsic::x86_avx_round_ps_256}, + {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32}, + {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b}, + {"meta.intrinsic.VCVTPD2PS", Intrinsic::x86_avx_cvt_pd2_ps_256}, + {"meta.intrinsic.VCVTPH2PS", Intrinsic::x86_vcvtph2ps_256}, + {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256}, + {"meta.intrinsic.VHSUBPS", Intrinsic::x86_avx_hsub_ps_256}, + {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256}, + {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256}, + {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256}, + {"meta.intrinsic.VMOVMSKPS", Intrinsic::x86_avx_movmsk_ps_256}, + {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d}, + {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32}, + {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}, + }; + + // Forward decls + Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); + Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); + Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); + + static std::map<std::string, X86Intrinsic> intrinsicMap2[] = { + // 256 wide 512 wide + { // AVX + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + 
{"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + }, + { // AVX2 + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + }, + { // AVX512 + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + } + }; + + struct LowerX86 : public FunctionPass + { + LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr) + : FunctionPass(ID), mpJitMgr(pJitMgr), B(b) + { + initializeLowerX86Pass(*PassRegistry::getPassRegistry()); + + // Determine target arch + if 
(mpJitMgr->mArch.AVX512F()) + { + mTarget = AVX512; + } + else if (mpJitMgr->mArch.AVX2()) + { + mTarget = AVX2; + } + else if (mpJitMgr->mArch.AVX()) + { + mTarget = AVX; + + } + else + { + SWR_ASSERT(false, "Unsupported AVX architecture."); + mTarget = AVX; + } + } + + // Try to decipher the vector type of the instruction. This does not work properly + // across all intrinsics, and will have to be rethought. Probably need something + // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed + // intrinsic. + void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy) + { + uint32_t vecWidth; + Type* pVecTy = pCallInst->getType(); + if (!pVecTy->isVectorTy()) + { + for (auto& op : pCallInst->arg_operands()) + { + if (op.get()->getType()->isVectorTy()) + { + pVecTy = op.get()->getType(); + break; + } + } + } + SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size"); + + uint32_t width = cast<VectorType>(pVecTy)->getBitWidth(); + switch (width) + { + case 256: *pWidth = W256; break; + case 512: *pWidth = W512; break; + default: SWR_ASSERT(false, "Unhandled vector width %d", width); + *pWidth = W256; + } + + *pTy = pVecTy->getScalarType(); + } + + Value* GetZeroVec(TargetWidth width, Type* pTy) + { + uint32_t numElem = 0; + switch (width) + { + case W256: numElem = 8; break; + case W512: numElem = 16; break; + } + + return ConstantVector::getNullValue(VectorType::get(pTy, numElem)); + } + + Value* GetMask(TargetWidth width) + { + Value* mask; + switch (width) + { + case W256: mask = B->C((uint8_t)-1); break; + case W512: mask = B->C((uint16_t)-1); break; + } + return mask; + } + + Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst) + { + Function* pFunc = pCallInst->getCalledFunction(); + auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()]; + TargetWidth vecWidth; + Type* pElemTy; + GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy); + + // Check if there is a native intrinsic for 
this instruction + Intrinsic::ID id = intrinsic.intrin[vecWidth]; + if (id != Intrinsic::not_intrinsic) + { + Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id); + SmallVector<Value*, 8> args; + for (auto& arg : pCallInst->arg_operands()) + { + args.push_back(arg.get()); + } + + // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now + // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list. + if (mTarget == AVX512) + { + args.push_back(GetZeroVec(vecWidth, pElemTy)); + args.push_back(GetMask(vecWidth)); + } + + return B->CALLA(pIntrin, args); + } + else + { + // No native intrinsic, call emulation function + return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst); + } + + SWR_ASSERT(false); + return nullptr; + } + + Instruction* ProcessIntrinsic(CallInst* pCallInst) + { + Function* pFunc = pCallInst->getCalledFunction(); + + // Forward to the advanced support if found + if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end()) + { + return ProcessIntrinsicAdvanced(pCallInst); + } + + SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName()); + + Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()]; + Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic); + + SmallVector<Value*, 8> args; + for (auto& arg : pCallInst->arg_operands()) + { + args.push_back(arg.get()); + } + return B->CALLA(pX86IntrinFunc, args); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief LLVM function pass run method. + /// @param F - The function we're working on with this pass. 
+ virtual bool runOnFunction(Function& F) + { + std::vector<Instruction*> toRemove; + + for (auto& BB : F.getBasicBlockList()) + { + for (auto& I : BB.getInstList()) + { + if (CallInst* pCallInst = dyn_cast<CallInst>(&I)) + { + Function* pFunc = pCallInst->getCalledFunction(); + if (pFunc) + { + if (pFunc->getName().startswith("meta.intrinsic")) + { + B->IRB()->SetInsertPoint(&I); + Instruction* pReplace = ProcessIntrinsic(pCallInst); + SWR_ASSERT(pReplace); + toRemove.push_back(pCallInst); + pCallInst->replaceAllUsesWith(pReplace); + } + } + + } + } + } + + for (auto* pInst : toRemove) + { + pInst->eraseFromParent(); + } + + JitManager::DumpToFile(&F, "lowerx86"); + + return true; + } + + virtual void getAnalysisUsage(AnalysisUsage& AU) const + { + } + + JitManager* JM() { return mpJitMgr; } + + JitManager* mpJitMgr; + Builder* B; + + TargetArch mTarget; + + static char ID; ///< Needed by LLVM to generate ID for FunctionPass. + }; + + char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID. 
+ + FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b) + { + return new LowerX86(pJitMgr, b); + } + + Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + { + SWR_ASSERT(false, "Unimplemented intrinsic emulation."); + return nullptr; + } + + Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + { + // Only need vperm emulation for AVX + SWR_ASSERT(arch == AVX); + + Builder* B = pThis->B; + auto v32A = pCallInst->getArgOperand(0); + auto vi32Index = pCallInst->getArgOperand(1); + + Value* v32Result; + if (isa<Constant>(vi32Index)) + { + // Can use llvm shuffle vector directly with constant shuffle indices + v32Result = B->VSHUFFLE(v32A, v32A, vi32Index); + } + else + { + v32Result = UndefValue::get(v32A->getType()); + for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l) + { + auto i32Index = B->VEXTRACT(vi32Index, B->C(l)); + auto val = B->VEXTRACT(v32A, i32Index); + v32Result = B->VINSERT(v32Result, val, B->C(l)); + } + } + return cast<Instruction>(v32Result); + } + + Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + { + Builder* B = pThis->B; + auto vSrc = pCallInst->getArgOperand(0); + auto pBase = pCallInst->getArgOperand(1); + auto vi32Indices = pCallInst->getArgOperand(2); + auto vi1Mask = pCallInst->getArgOperand(3); + auto i8Scale = pCallInst->getArgOperand(4); + + pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0)); + uint32_t numElem = vSrc->getType()->getVectorNumElements(); + auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); + auto srcTy = vSrc->getType()->getVectorElementType(); + Value* v32Gather; + if (arch == AVX) + { + // Full emulation for AVX + // Store source on stack to provide a valid address to load from inactive lanes + auto pStack = B->STACKSAVE(); + auto pTmp = B->ALLOCA(vSrc->getType()); + B->STORE(vSrc, pTmp); + + v32Gather = UndefValue::get(vSrc->getType()); + 
auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale)); + auto vi32Offsets = B->MUL(vi32Indices, vi32Scale); + + for (uint32_t i = 0; i < numElem; ++i) + { + auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i)); + auto pLoadAddress = B->GEP(pBase, i32Offset); + pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0)); + auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i }); + auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i)); + auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress); + auto val = B->LOAD(pValidAddress); + v32Gather = B->VINSERT(v32Gather, val, B->C(i)); + } + + B->STACKRESTORE(pStack); + } + else if (arch == AVX2 || (arch == AVX512 && width == W256)) + { + Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) : + Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256); + if (width == W256) + { + auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType()); + v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale }); + } + else if (width == W512) + { + // Double pump 8-wide + auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType()); + Value *src0 = B->EXTRACT_16(vSrc, 0); + Value *src1 = B->EXTRACT_16(vSrc, 1); + + Value *indices0 = B->EXTRACT_16(vi32Indices, 0); + Value *indices1 = B->EXTRACT_16(vi32Indices, 1); + + Value *mask0 = B->EXTRACT_16(v32Mask, 0); + Value *mask1 = B->EXTRACT_16(v32Mask, 1); + + Value *gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale }); + Value *gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale }); + + v32Gather = B->JOIN_16(gather0, gather1); + } + } + else if (arch == AVX512) + { + auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty); + + Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) : + Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512); + auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); + v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale }); + } + + return cast<Instruction>(v32Gather); + } +} + +using namespace SwrJit; + +INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false) +INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false) + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h new file mode 100644 index 00000000000..f7373f034be --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h @@ -0,0 +1,37 @@ +/**************************************************************************** +* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file passes.h +* +* @brief Include file for llvm passes +* +******************************************************************************/ + +#include "JitManager.h" +#include "builder.h" + +namespace SwrJit +{ + using namespace llvm; + + FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build index 4a2f46ae1e7..5c201990b50 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build +++ b/src/gallium/drivers/swr/rasterizer/jitter/meson.build @@ -37,12 +37,12 @@ gen_builder_hpp = custom_target( build_by_default : true, ) -gen_builder_x86_hpp = custom_target( - 'gen_builder_x86.hpp', +gen_builder_meta_hpp = custom_target( + 'gen_builder_meta.hpp', input : '../codegen/gen_llvm_ir_macros.py', - output : 'gen_builder_x86.hpp', + output : 'gen_builder_meta.hpp', command : [ - prog_python2, '@INPUT0@', '--gen_x86_h', '--output', '@OUTPUT@', + prog_python2, '@INPUT0@', '--gen_meta_h', '--output', '@OUTPUT@', '--output-dir', '@OUTDIR@' ], depend_files : swr_gen_builder_depends, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index f9d858090ff..15a6bc40289 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -32,6 +32,7 @@ #include "jit_api.h" #include "streamout_jit.h" #include "gen_state_llvm.h" +#include "functionpasses/passes.h" using namespace llvm; using namespace SwrJit; @@ -306,6 +307,8 @@ struct StreamOutJit : public Builder passes.add(createSCCPPass()); 
passes.add(createAggressiveDCEPass()); + passes.add(createLowerX86Pass(JM(), this)); + passes.run(*soFunc); JitManager::DumpToFile(soFunc, "SoFunc_optimized"); |