aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr/rasterizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py58
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/meson.build2
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp11
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp3
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.cpp4
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.h6
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp5
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp3
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp455
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h37
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/meson.build8
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp3
12 files changed, 556 insertions, 39 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 324f24a3557..bdd785a155d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,28 +42,28 @@ inst_aliases = {
}
intrinsics = [
- ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
- ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
- ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
- ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
- ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
- ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
- ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']],
- ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']],
- ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']],
- ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']],
- ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']],
- ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
- ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
- ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
- ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
- ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b']],
- ['RDTSC', 'x86_rdtsc', []],
+ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'],
+ ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'],
+ ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'],
+ ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'],
+ ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'],
+ ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
+ ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
+ ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
+ ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
+ ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
+ ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
+ ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
+ ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
+ ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
+ ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
+ ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
+ ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
+ ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
+ ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
]
llvm_intrinsics = [
@@ -223,8 +223,8 @@ def generate_gen_h(functions, output_dir):
'''
Auto-generates macros for LLVM IR
'''
-def generate_x86_h(output_dir):
- filename = 'gen_builder_x86.hpp'
+def generate_meta_h(output_dir):
+ filename = 'gen_builder_meta.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
@@ -238,15 +238,17 @@ def generate_x86_h(output_dir):
functions.append({
'decl' : decl,
+ 'name' : inst[0],
'intrin' : inst[1],
'args' : inst[2],
+ 'returnType': inst[3]
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
- comment='x86 intrinsics',
+ comment='meta intrinsics',
filename=filename,
functions=functions,
isX86=True, isIntrin=False)
@@ -291,7 +293,7 @@ def main():
parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
- parser.add_argument('--gen_x86_h', help='Generate x86 intrinsics. No input is needed.', action='store_true', default=False)
+ parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
args = parser.parse_args()
@@ -307,8 +309,8 @@ def main():
elif args.gen_h:
print('Need to specify --input for --gen_h!')
- if args.gen_x86_h:
- generate_x86_h(args.output)
+ if args.gen_meta_h:
+ generate_meta_h(args.output)
if args.gen_intrin_h:
generate_intrin_h(args.output)
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/meson.build b/src/gallium/drivers/swr/rasterizer/codegen/meson.build
index bbe6efff01a..841540e0f30 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/meson.build
+++ b/src/gallium/drivers/swr/rasterizer/codegen/meson.build
@@ -44,7 +44,7 @@ gen_knobs_h = custom_target(
# The generators above this are needed individually, while the below generators
# are all inputs to the same lib, so they don't need unique names.
files_swr_common += [
- gen_builder_hpp, gen_builder_x86_hpp, gen_knobs_h, gen_knobs_cpp
+ gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp
]
foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'],
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
index 5a47c9aa105..bcbcb30cc14 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
@@ -40,7 +40,16 @@
${func['decl']}
{
%if isX86:
- Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
+ %if len(func['args']) != 0:
+ SmallVector<Type*, ${len(func['args'])}> argTypes;
+ %for arg in func['args']:
+ argTypes.push_back(${arg}->getType());
+ %endfor
+ FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
+ %else:
+ FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
+ %endif:
+ Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%elif isIntrin:
%if len(func['types']) != 0:
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 912a88fd00d..58fdb7fb171 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -32,6 +32,7 @@
#include "jit_api.h"
#include "blend_jit.h"
#include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
#define QUANTIZE_THRESHOLD 2
@@ -820,6 +821,8 @@ struct BlendJit : public Builder
passes.add(createSCCPPass());
passes.add(createAggressiveDCEPass());
+ passes.add(createLowerX86Pass(JM(), this));
+
passes.run(*blendFunc);
JitManager::DumpToFile(blendFunc, "optimized");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 9f9438de1d8..260daab8621 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -65,6 +65,8 @@ namespace SwrJit
mInt32PtrTy = PointerType::get(mInt32Ty, 0);
mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
+ mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
+
// Built in types: simd8
mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
@@ -87,6 +89,8 @@ namespace SwrJit
mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4);
mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5);
+ mSimd32Int8Ty = VectorType::get(mInt8Ty, 32);
+
if (sizeof(uint32_t*) == 4)
{
mIntPtrTy = mInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 516e872eb0e..0b57fbf16d4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -66,6 +66,8 @@ namespace SwrJit
Type* mInt16PtrTy;
Type* mInt32PtrTy;
+ Type* mSimd4FP64Ty;
+
// Built in types: simd8
Type* mSimdFP16Ty;
@@ -90,8 +92,10 @@ namespace SwrJit
Type* mSimd16VectorTy;
Type* mSimd16VectorTRTy;
+ Type* mSimd32Int8Ty;
+
#include "gen_builder.hpp"
-#include "gen_builder_x86.hpp"
+#include "gen_builder_meta.hpp"
#include "gen_builder_intrin.hpp"
#include "builder_misc.h"
#include "builder_math.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index dee08b81693..68695c46c81 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -159,10 +159,7 @@ namespace SwrJit
// use avx2 gather instruction if available
if (JM()->mArch.AVX2())
{
- // force mask to <N x float>, required by vgather
- Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
-
- vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale));
+ vGather = VGATHERPS(vSrc, pBasePtr, vIndices, vMask, C(scale));
}
else
{
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 5971a52db7e..f9293aa3b4b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -32,6 +32,7 @@
#include "jit_api.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
//#define FETCH_DUMP_VERTEX 1
using namespace llvm;
@@ -356,6 +357,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
optPasses.add(createAggressiveDCEPass());
optPasses.run(*fetch);
+
+ optPasses.add(createLowerX86Pass(JM(), this));
optPasses.run(*fetch);
JitManager::DumpToFile(fetch, "opt");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
new file mode 100644
index 00000000000..11a2397c43d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -0,0 +1,455 @@
+/****************************************************************************
+* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file lower_x86.cpp
+*
+* @brief llvm pass to lower meta code to x86
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "jit_pch.hpp"
+#include "passes.h"
+#include "JitManager.h"
+
+#include <unordered_map>
+
+
+namespace llvm
+{
+ // forward declare the initializer
+ void initializeLowerX86Pass(PassRegistry&);
+}
+
+namespace SwrJit
+{
+ using namespace llvm;
+
+ enum TargetArch
+ {
+ AVX = 0,
+ AVX2 = 1,
+ AVX512 = 2
+ };
+
+ enum TargetWidth
+ {
+ W256 = 0,
+ W512 = 1,
+ NUM_WIDTHS = 2
+ };
+
+ struct LowerX86;
+
+ typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
+
+ struct X86Intrinsic
+ {
+ Intrinsic::ID intrin[NUM_WIDTHS];
+ EmuFunc emuFunc;
+ };
+
+ // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of
+ // mapping directly to avx/avx2 intrinsics.
+ static std::map<std::string, Intrinsic::ID> intrinsicMap = {
+ {"meta.intrinsic.VGATHERPD", Intrinsic::x86_avx2_gather_d_pd_256},
+ {"meta.intrinsic.VROUND", Intrinsic::x86_avx_round_ps_256},
+ {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
+ {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
+ {"meta.intrinsic.VCVTPD2PS", Intrinsic::x86_avx_cvt_pd2_ps_256},
+ {"meta.intrinsic.VCVTPH2PS", Intrinsic::x86_vcvtph2ps_256},
+ {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
+ {"meta.intrinsic.VHSUBPS", Intrinsic::x86_avx_hsub_ps_256},
+ {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
+ {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
+ {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
+ {"meta.intrinsic.VMOVMSKPS", Intrinsic::x86_avx_movmsk_ps_256},
+ {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
+ {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
+ {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
+ };
+
+ // Forward decls
+ Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+ Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+ Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+
+ static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
+ // 256 wide 512 wide
+ { // AVX
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ },
+ { // AVX2
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ },
+ { // AVX512
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ }
+ };
+
+ struct LowerX86 : public FunctionPass
+ {
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Construct the pass and pick the lowering target from the JIT
+        ///        manager's architecture flags (AVX512F, then AVX2, then AVX).
+        /// @param pJitMgr - JIT manager whose mArch is queried. Defaults to nullptr.
+        /// @param b - Builder used later to emit replacement instructions.
+        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
+            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b), mTarget(AVX)
+        {
+            initializeLowerX86Pass(*PassRegistry::getPassRegistry());
+
+            // Both parameters default to nullptr, so the manager must not be
+            // dereferenced unconditionally. Keep the AVX baseline in that case.
+            if (mpJitMgr == nullptr)
+            {
+                return;
+            }
+
+            // Determine target arch
+            if (mpJitMgr->mArch.AVX512F())
+            {
+                mTarget = AVX512;
+            }
+            else if (mpJitMgr->mArch.AVX2())
+            {
+                mTarget = AVX2;
+            }
+            else if (mpJitMgr->mArch.AVX())
+            {
+                mTarget = AVX;
+            }
+            else
+            {
+                SWR_ASSERT(false, "Unsupported AVX architecture.");
+                mTarget = AVX;
+            }
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Derive the requested vector width and scalar element type of a
+        ///        meta intrinsic call.
+        ///
+        /// Try to decipher the vector type of the instruction. This does not work properly
+        /// across all intrinsics, and will have to be rethought. Probably need something
+        /// similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
+        /// intrinsic.
+        /// @param pCallInst - call whose return type, or first vector-typed argument, is inspected.
+        /// @param pWidth - receives W256 or W512; any other bit width asserts and falls back to W256.
+        /// @param pTy - receives the scalar element type of the chosen vector type.
+        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
+        {
+            // Prefer the return type; if it is not a vector use the first vector-typed argument.
+            Type* pVecTy = pCallInst->getType();
+            if (!pVecTy->isVectorTy())
+            {
+                for (auto& op : pCallInst->arg_operands())
+                {
+                    if (op.get()->getType()->isVectorTy())
+                    {
+                        pVecTy = op.get()->getType();
+                        break;
+                    }
+                }
+            }
+            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
+
+            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
+            switch (width)
+            {
+            case 256: *pWidth = W256; break;
+            case 512: *pWidth = W512; break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width %d", width);
+                *pWidth = W256;
+                break;
+            }
+
+            *pTy = pVecTy->getScalarType();
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Build an all-zero constant vector of the given element type.
+        /// @param width - W256 yields 8 elements, W512 yields 16; anything else yields 0.
+        /// @param pTy - element type of the vector.
+        Value* GetZeroVec(TargetWidth width, Type* pTy)
+        {
+            const uint32_t laneCount = (width == W256) ? 8 : ((width == W512) ? 16 : 0);
+            Type* pZeroVecTy = VectorType::get(pTy, laneCount);
+            return ConstantVector::getNullValue(pZeroVecTy);
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Build an all-ones integer constant used as the "full" lane mask.
+        /// @param width - W256 produces an 8-bit constant, W512 a 16-bit constant.
+        /// @return The mask constant, or nullptr (after asserting) for an unhandled width.
+        Value* GetMask(TargetWidth width)
+        {
+            // Initialised so an unexpected width can never return an indeterminate pointer.
+            Value* mask = nullptr;
+            switch (width)
+            {
+            case W256: mask = B->C((uint8_t)-1); break;
+            case W512: mask = B->C((uint16_t)-1); break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width %d", static_cast<int>(width));
+                break;
+            }
+            return mask;
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Lower a meta intrinsic that has an entry in intrinsicMap2 for the
+        ///        current target, using either a native intrinsic or its emulation function.
+        /// @param pCallInst - call to the meta intrinsic. The builder's insert point is
+        ///        expected to have been set by the caller.
+        /// @return The instruction that replaces the call.
+        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
+        {
+            Function* pFunc = pCallInst->getCalledFunction();
+            // Explicit .str(): the table is keyed by std::string.
+            auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
+            TargetWidth vecWidth;
+            Type* pElemTy;
+            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);
+
+            // No native intrinsic for this width, call emulation function
+            Intrinsic::ID id = intrinsic.intrin[vecWidth];
+            if (id == Intrinsic::not_intrinsic)
+            {
+                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
+            }
+
+            // Native intrinsic available: forward the original arguments to it
+            Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
+            SmallVector<Value*, 8> args;
+            for (auto& arg : pCallInst->arg_operands())
+            {
+                args.push_back(arg.get());
+            }
+
+            // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
+            // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
+            if (mTarget == AVX512)
+            {
+                args.push_back(GetZeroVec(vecWidth, pElemTy));
+                args.push_back(GetMask(vecWidth));
+            }
+
+            return B->CALLA(pIntrin, args);
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Lower one call to a meta intrinsic.
+        ///
+        /// Intrinsics listed in intrinsicMap2 for the current target go through
+        /// ProcessIntrinsicAdvanced(). Everything else is mapped one-to-one onto the
+        /// x86 intrinsic recorded in intrinsicMap, with the original arguments forwarded.
+        /// @param pCallInst - call to the meta intrinsic.
+        /// @return The instruction that replaces the call.
+        Instruction* ProcessIntrinsic(CallInst* pCallInst)
+        {
+            Function* pFunc = pCallInst->getCalledFunction();
+
+            // Materialise the name once: both tables are keyed by std::string, and a
+            // StringRef must not be passed through varargs to a "%s" conversion.
+            const std::string name = pFunc->getName().str();
+
+            // Forward to the advanced support if found
+            auto& advancedTable = intrinsicMap2[mTarget];
+            if (advancedTable.find(name) != advancedTable.end())
+            {
+                return ProcessIntrinsicAdvanced(pCallInst);
+            }
+
+            auto legacyIt = intrinsicMap.find(name);
+            SWR_ASSERT(legacyIt != intrinsicMap.end(), "Unimplemented intrinsic %s.", name.c_str());
+
+            // A miss yields not_intrinsic, which is what operator[] would have produced,
+            // but without inserting a bogus entry into the table.
+            Intrinsic::ID x86Intrinsic = (legacyIt != intrinsicMap.end()) ? legacyIt->second : Intrinsic::not_intrinsic;
+            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
+
+            SmallVector<Value*, 8> args;
+            for (auto& arg : pCallInst->arg_operands())
+            {
+                args.push_back(arg.get());
+            }
+            return B->CALLA(pX86IntrinFunc, args);
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief LLVM function pass run method. Replaces every direct call to a
+        ///        function whose name starts with "meta.intrinsic" with its lowered form.
+        /// @param F - The function we're working on with this pass.
+        /// @return true if at least one call was replaced, false if F was left untouched.
+        virtual bool runOnFunction(Function& F)
+        {
+            // Calls are erased only after the walk so the instruction lists are not
+            // modified underneath the iterators.
+            std::vector<Instruction*> toRemove;
+
+            for (auto& BB : F.getBasicBlockList())
+            {
+                for (auto& I : BB.getInstList())
+                {
+                    CallInst* pCallInst = dyn_cast<CallInst>(&I);
+                    if (!pCallInst)
+                    {
+                        continue;
+                    }
+
+                    // getCalledFunction() is null for indirect calls; those are never meta intrinsics
+                    Function* pFunc = pCallInst->getCalledFunction();
+                    if (!pFunc || !pFunc->getName().startswith("meta.intrinsic"))
+                    {
+                        continue;
+                    }
+
+                    // Emit the replacement immediately before the call being lowered
+                    B->IRB()->SetInsertPoint(&I);
+                    Instruction* pReplace = ProcessIntrinsic(pCallInst);
+                    SWR_ASSERT(pReplace);
+                    toRemove.push_back(pCallInst);
+                    pCallInst->replaceAllUsesWith(pReplace);
+                }
+            }
+
+            for (auto* pInst : toRemove)
+            {
+                pInst->eraseFromParent();
+            }
+
+            JitManager::DumpToFile(&F, "lowerx86");
+
+            // Report a change only when a call was actually rewritten
+            return !toRemove.empty();
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief Analysis usage hook; this pass adds nothing to AU.
+        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
+
+ JitManager* JM() { return mpJitMgr; }
+
+ JitManager* mpJitMgr;
+ Builder* B;
+
+ TargetArch mTarget;
+
+ static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
+ };
+
+ char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Factory for the x86 lowering pass.
+    /// @param pJitMgr - JIT manager forwarded to the LowerX86 constructor.
+    /// @param b - Builder forwarded to the LowerX86 constructor.
+    /// @return Newly allocated pass object, returned as a raw pointer.
+    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
+    {
+        LowerX86* pLowerPass = new LowerX86(pJitMgr, b);
+        return pLowerPass;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Placeholder emulation entry for intrinsics that have no emulation.
+    ///        Asserts and produces no instruction; all parameters are ignored.
+    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        Instruction* pNoReplacement = nullptr;
+        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
+        return pNoReplacement;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emulate a variable permute without a native permute intrinsic.
+    ///
+    /// Operand 0 of the call is treated as the source vector and operand 1 as the
+    /// index vector. Constant indices become a single shuffle; otherwise the result
+    /// is assembled one element at a time with extract/insert.
+    /// @param pThis - pass instance supplying the Builder.
+    /// @param arch - target architecture; asserted to be AVX.
+    /// @param width - requested vector width (not used here).
+    /// @param pCallInst - the meta intrinsic call being replaced.
+    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        // Only need vperm emulation for AVX
+        SWR_ASSERT(arch == AVX);
+
+        Builder* pBuilder = pThis->B;
+        auto pSource = pCallInst->getArgOperand(0);
+        auto pIndices = pCallInst->getArgOperand(1);
+
+        // Can use llvm shuffle vector directly with constant shuffle indices
+        if (isa<Constant>(pIndices))
+        {
+            return cast<Instruction>(pBuilder->VSHUFFLE(pSource, pSource, pIndices));
+        }
+
+        // Dynamic indices: pick each output element individually
+        Value* pPermuted = UndefValue::get(pSource->getType());
+        for (uint32_t lane = 0; lane < pSource->getType()->getVectorNumElements(); ++lane)
+        {
+            auto pLaneIndex = pBuilder->VEXTRACT(pIndices, pBuilder->C(lane));
+            auto pPicked = pBuilder->VEXTRACT(pSource, pLaneIndex);
+            pPermuted = pBuilder->VINSERT(pPermuted, pPicked, pBuilder->C(lane));
+        }
+        return cast<Instruction>(pPermuted);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Lower a meta gather call for the given target.
+    ///
+    /// Call operands, in order: source vector, base pointer (as integer), index
+    /// vector, i1 lane mask, scale.
+    ///  - AVX: fully emulated with one scalar load per element. Masked-off elements
+    ///    load from a stack copy of the source so every load has a valid address.
+    ///  - AVX2, or AVX512 at 256 bits: uses the avx2 256-bit gather intrinsics; a
+    ///    512-bit request is issued as two 256-bit gathers and joined.
+    ///  - AVX512 at 512 bits: uses the avx512 512-bit gather intrinsics.
+    /// @param pThis - pass instance supplying the Builder.
+    /// @param arch - target architecture selected by the pass.
+    /// @param width - requested vector width.
+    /// @param pCallInst - the meta intrinsic call being replaced.
+    /// @return The gathered value, or nullptr (after asserting) if no path applied.
+    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        Builder* B = pThis->B;
+        auto vSrc = pCallInst->getArgOperand(0);
+        auto pBase = pCallInst->getArgOperand(1);
+        auto vi32Indices = pCallInst->getArgOperand(2);
+        auto vi1Mask = pCallInst->getArgOperand(3);
+        auto i8Scale = pCallInst->getArgOperand(4);
+
+        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
+        uint32_t numElem = vSrc->getType()->getVectorNumElements();
+        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+        auto srcTy = vSrc->getType()->getVectorElementType();
+
+        // Initialised so an unhandled arch/width combination is detectable below
+        // instead of handing an indeterminate pointer to cast<>.
+        Value* v32Gather = nullptr;
+        if (arch == AVX)
+        {
+            // Full emulation for AVX
+            // Store source on stack to provide a valid address to load from inactive lanes
+            auto pStack = B->STACKSAVE();
+            auto pTmp = B->ALLOCA(vSrc->getType());
+            B->STORE(vSrc, pTmp);
+
+            v32Gather = UndefValue::get(vSrc->getType());
+            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
+            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
+
+            for (uint32_t i = 0; i < numElem; ++i)
+            {
+                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
+                auto pLoadAddress = B->GEP(pBase, i32Offset);
+                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
+                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
+                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
+                // Active lanes read from memory, inactive lanes re-read their source element
+                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
+                auto val = B->LOAD(pValidAddress);
+                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
+            }
+
+            B->STACKRESTORE(pStack);
+        }
+        else if (arch == AVX2 || (arch == AVX512 && width == W256))
+        {
+            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
+                                                             Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
+            if (width == W256)
+            {
+                auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
+                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
+            }
+            else if (width == W512)
+            {
+                // Double pump 8-wide
+                auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
+                Value *src0 = B->EXTRACT_16(vSrc, 0);
+                Value *src1 = B->EXTRACT_16(vSrc, 1);
+
+                Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
+                Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
+
+                Value *mask0 = B->EXTRACT_16(v32Mask, 0);
+                Value *mask1 = B->EXTRACT_16(v32Mask, 1);
+
+                Value *gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
+                Value *gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+
+                v32Gather = B->JOIN_16(gather0, gather1);
+            }
+        }
+        else if (arch == AVX512)
+        {
+            auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);
+
+            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
+                                                             Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
+            // Reuses the i32 scale computed above rather than extending it a second time
+            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });
+        }
+
+        SWR_ASSERT(v32Gather != nullptr, "Unhandled arch/width combination for gather.");
+        if (v32Gather == nullptr)
+        {
+            return nullptr;
+        }
+
+        return cast<Instruction>(v32Gather);
+    }
+}
+
+using namespace SwrJit;
+
+INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
+INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
new file mode 100644
index 00000000000..f7373f034be
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
@@ -0,0 +1,37 @@
+/****************************************************************************
+* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file passes.h
+*
+* @brief Include file for llvm passes
+*
+******************************************************************************/
+
+#pragma once
+
+#include "JitManager.h"
+#include "builder.h"
+
+namespace SwrJit
+{
+    using namespace llvm;
+
+    /// @brief Create the function pass that lowers "meta.intrinsic.*" calls to x86 code.
+    /// @param pJitMgr - JIT manager handed to the pass.
+    /// @param b - Builder handed to the pass.
+    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
index 4a2f46ae1e7..5c201990b50 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build
+++ b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
@@ -37,12 +37,12 @@ gen_builder_hpp = custom_target(
build_by_default : true,
)
-gen_builder_x86_hpp = custom_target(
- 'gen_builder_x86.hpp',
+gen_builder_meta_hpp = custom_target(
+ 'gen_builder_meta.hpp',
input : '../codegen/gen_llvm_ir_macros.py',
- output : 'gen_builder_x86.hpp',
+ output : 'gen_builder_meta.hpp',
command : [
- prog_python2, '@INPUT0@', '--gen_x86_h', '--output', '@OUTPUT@',
+ prog_python2, '@INPUT0@', '--gen_meta_h', '--output', '@OUTPUT@',
'--output-dir', '@OUTDIR@'
],
depend_files : swr_gen_builder_depends,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index f9d858090ff..15a6bc40289 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -32,6 +32,7 @@
#include "jit_api.h"
#include "streamout_jit.h"
#include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
using namespace llvm;
using namespace SwrJit;
@@ -306,6 +307,8 @@ struct StreamOutJit : public Builder
passes.add(createSCCPPass());
passes.add(createAggressiveDCEPass());
+ passes.add(createLowerX86Pass(JM(), this));
+
passes.run(*soFunc);
JitManager::DumpToFile(soFunc, "SoFunc_optimized");