12 files changed, 556 insertions, 39 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 324f24a3557..bdd785a155d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,28 +42,28 @@ inst_aliases = {
 }
 
 intrinsics = [
-    ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
-    ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
-    ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
-    ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
-    ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
-    ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
-    ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
-    ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
-    ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
-    ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
-    ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
-    ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']],
-    ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']],
-    ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']],
-    ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']],
-    ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']],
-    ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
-    ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
-    ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
-    ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
-    ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b']],
-    ['RDTSC', 'x86_rdtsc', []],
+    ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'],
+    ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'],
+    ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'],
+    ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'],
+    ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'],
+    ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
+    ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
+    ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
+    ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
+    ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
+    ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
+    ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
+    ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
+    ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
+    ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
+    ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
+    ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
+    ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
+    ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
+    ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
+    ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
+    ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
 ]
 
 llvm_intrinsics = [
@@ -223,8 +223,8 @@ def generate_gen_h(functions, output_dir):
 '''
     Auto-generates macros for LLVM IR
 '''
-def generate_x86_h(output_dir):
-    filename = 'gen_builder_x86.hpp'
+def generate_meta_h(output_dir):
+    filename = 'gen_builder_meta.hpp'
     output_filename = os.path.join(output_dir, filename)
 
     functions = []
@@ -238,15 +238,17 @@ def generate_x86_h(output_dir):
 
         functions.append({
             'decl'      : decl,
+            'name'      : inst[0],
             'intrin'    : inst[1],
             'args'      : inst[2],
+            'returnType': inst[3]
         })
 
     MakoTemplateWriter.to_file(
         template,
         output_filename,
         cmdline=sys.argv,
-        comment='x86 intrinsics',
+        comment='meta intrinsics',
         filename=filename,
         functions=functions,
         isX86=True, isIntrin=False)
@@ -291,7 +293,7 @@ def main():
     parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
     parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
     parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
-    parser.add_argument('--gen_x86_h', help='Generate x86 intrinsics. No input is needed.', action='store_true', default=False)
+    parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
     parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
     args = parser.parse_args()
 
@@ -307,8 +309,8 @@ def main():
     elif args.gen_h:
         print('Need to specify --input for --gen_h!')
 
-    if args.gen_x86_h:
-        generate_x86_h(args.output)
+    if args.gen_meta_h:
+        generate_meta_h(args.output)
 
     if args.gen_intrin_h:
         generate_intrin_h(args.output)
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/meson.build b/src/gallium/drivers/swr/rasterizer/codegen/meson.build
index bbe6efff01a..841540e0f30 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/meson.build
+++ b/src/gallium/drivers/swr/rasterizer/codegen/meson.build
@@ -44,7 +44,7 @@ gen_knobs_h = custom_target(
 # The generators above this are needed individually, while the below generators
 # are all inputs to the same lib, so they don't need unique names.
 files_swr_common += [
-  gen_builder_hpp, gen_builder_x86_hpp, gen_knobs_h, gen_knobs_cpp
+  gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp
 ]
 
 foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'],
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
index 5a47c9aa105..bcbcb30cc14 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
@@ -40,7 +40,16 @@
 ${func['decl']}
 {
 %if isX86:
-    Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
+    %if len(func['args']) != 0:
+    SmallVector<Type*, ${len(func['args'])}> argTypes;
+    %for arg in func['args']:
+    argTypes.push_back(${arg}->getType());
+    %endfor
+    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
+    %else:
+    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
+    %endif:
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
     return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
 %elif isIntrin:
     %if len(func['types']) != 0:
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 912a88fd00d..58fdb7fb171 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -32,6 +32,7 @@
 #include "jit_api.h"
 #include "blend_jit.h"
 #include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
 
 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
 #define QUANTIZE_THRESHOLD 2
@@ -820,6 +821,8 @@ struct BlendJit : public Builder
         passes.add(createSCCPPass());
         passes.add(createAggressiveDCEPass());
 
+        passes.add(createLowerX86Pass(JM(), this));
+
         passes.run(*blendFunc);
 
         JitManager::DumpToFile(blendFunc, "optimized");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 9f9438de1d8..260daab8621 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -65,6 +65,8 @@ namespace SwrJit
         mInt32PtrTy = PointerType::get(mInt32Ty, 0);
         mInt64Ty    = Type::getInt64Ty(pJitMgr->mContext);
 
+        mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
+
         // Built in types: simd8
 
         mSimdInt1Ty     = VectorType::get(mInt1Ty,  mVWidth);
@@ -87,6 +89,8 @@ namespace SwrJit
         mSimd16VectorTy     = ArrayType::get(mSimd16FP32Ty, 4);
         mSimd16VectorTRTy   = ArrayType::get(mSimd16FP32Ty, 5);
 
+        mSimd32Int8Ty       = VectorType::get(mInt8Ty, 32);
+
         if (sizeof(uint32_t*) == 4)
         {
             mIntPtrTy = mInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 516e872eb0e..0b57fbf16d4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -66,6 +66,8 @@ namespace SwrJit
         Type*                mInt16PtrTy;
         Type*                mInt32PtrTy;
 
+        Type*                mSimd4FP64Ty;
+
         // Built in types: simd8
 
         Type*                mSimdFP16Ty;
@@ -90,8 +92,10 @@ namespace SwrJit
         Type*                mSimd16VectorTy;
         Type*                mSimd16VectorTRTy;
 
+        Type*                mSimd32Int8Ty;
+
 #include "gen_builder.hpp"
-#include "gen_builder_x86.hpp"
+#include "gen_builder_meta.hpp"
 #include "gen_builder_intrin.hpp"
 #include "builder_misc.h"
 #include "builder_math.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index dee08b81693..68695c46c81 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -159,10 +159,7 @@ namespace SwrJit
         // use avx2 gather instruction if available
         if (JM()->mArch.AVX2())
         {
-            // force mask to <N x float>, required by vgather
-            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
-
-            vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale));
+            vGather = VGATHERPS(vSrc, pBasePtr, vIndices, vMask, C(scale));
         }
         else
         {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 5971a52db7e..f9293aa3b4b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -32,6 +32,7 @@
 #include "jit_api.h"
 #include "fetch_jit.h"
 #include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
 
 //#define FETCH_DUMP_VERTEX 1
 using namespace llvm;
@@ -356,6 +357,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     optPasses.add(createAggressiveDCEPass());
 
     optPasses.run(*fetch);
+
+    optPasses.add(createLowerX86Pass(JM(), this));
     optPasses.run(*fetch);
 
     JitManager::DumpToFile(fetch, "opt");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
new file mode 100644
index 00000000000..11a2397c43d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -0,0 +1,455 @@
+/****************************************************************************
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file lower_x86.cpp
+*
+* @brief llvm pass to lower meta code to x86
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "jit_pch.hpp"
+#include "passes.h"
+#include "JitManager.h"
+
+#include <unordered_map>
+
+
+namespace llvm
+{
+    // forward declare the initializer
+    void initializeLowerX86Pass(PassRegistry&);
+}
+
+namespace SwrJit
+{
+    using namespace llvm;
+
+    enum TargetArch // values double as the first index into intrinsicMap2
+    {
+        AVX = 0,
+        AVX2 = 1,
+        AVX512 = 2
+    };
+
+    enum TargetWidth // values index X86Intrinsic::intrin
+    {
+        W256 = 0,
+        W512 = 1,
+        NUM_WIDTHS = 2 // number of widths above; sizes X86Intrinsic::intrin
+    };
+
+    struct LowerX86;
+
+    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
+
+    struct X86Intrinsic
+    {
+        Intrinsic::ID intrin[NUM_WIDTHS]; // native intrinsic per TargetWidth; not_intrinsic selects emuFunc instead
+        EmuFunc emuFunc; // fallback called when no native intrinsic is listed for the requested width
+    };
+
+    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of
+    // mapping directly to avx/avx2 intrinsics.
+    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
+        {"meta.intrinsic.VGATHERPD",       Intrinsic::x86_avx2_gather_d_pd_256},
+        {"meta.intrinsic.VROUND",          Intrinsic::x86_avx_round_ps_256},
+        {"meta.intrinsic.BEXTR_32",        Intrinsic::x86_bmi_bextr_32},
+        {"meta.intrinsic.VPSHUFB",         Intrinsic::x86_avx2_pshuf_b},
+        {"meta.intrinsic.VCVTPD2PS",       Intrinsic::x86_avx_cvt_pd2_ps_256},
+        {"meta.intrinsic.VCVTPH2PS",       Intrinsic::x86_vcvtph2ps_256},
+        {"meta.intrinsic.VCVTPS2PH",       Intrinsic::x86_vcvtps2ph_256},
+        {"meta.intrinsic.VHSUBPS",         Intrinsic::x86_avx_hsub_ps_256},
+        {"meta.intrinsic.VPTESTC",         Intrinsic::x86_avx_ptestc_256},
+        {"meta.intrinsic.VPTESTZ",         Intrinsic::x86_avx_ptestz_256},
+        {"meta.intrinsic.VFMADDPS",        Intrinsic::x86_fma_vfmadd_ps_256},
+        {"meta.intrinsic.VMOVMSKPS",       Intrinsic::x86_avx_movmsk_ps_256},
+        {"meta.intrinsic.VPHADDD",         Intrinsic::x86_avx2_phadd_d},
+        {"meta.intrinsic.PDEP32",          Intrinsic::x86_bmi_pdep_32},
+        {"meta.intrinsic.RDTSC",           Intrinsic::x86_rdtsc},
+    };
+
+    // Forward decls
+    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+
+    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
+        //                              256 wide                                    512 wide
+    {   // AVX
+        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                      NO_EMU}},
+        {"meta.intrinsic.VPERMPS",     {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
+        {"meta.intrinsic.VPERMD",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
+        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+    },
+    {   // AVX2
+        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                      NO_EMU}},
+        {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx2_permps,                 Intrinsic::not_intrinsic},                      VPERM_EMU}},
+        {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx2_permd,                  Intrinsic::not_intrinsic},                      VPERM_EMU}},
+        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+    },
+    {   // AVX512
+        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx512_rcp14_ps_256,         Intrinsic::x86_avx512_rcp14_ps_512},            NO_EMU}},
+        {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx512_mask_permvar_sf_256,  Intrinsic::x86_avx512_mask_permvar_sf_512},     NO_EMU}},
+        {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx512_mask_permvar_si_256,  Intrinsic::x86_avx512_mask_permvar_si_512},     NO_EMU}},
+        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+    }
+    };
+
+    struct LowerX86 : public FunctionPass
+    {
+        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
+            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b), mTarget(AVX)
+        {
+            initializeLowerX86Pass(*PassRegistry::getPassRegistry());
+
+            // Default arguments allow construction without a JitManager; keep the AVX fallback then.
+            if (mpJitMgr == nullptr)
+            {
+                return;
+            }
+
+            // Determine target arch, preferring the widest supported ISA (AVX is the pre-set fallback)
+            if (mpJitMgr->mArch.AVX512F())
+            {
+                mTarget = AVX512;
+            }
+            else if (mpJitMgr->mArch.AVX2())
+            {
+                mTarget = AVX2;
+            }
+            else if (!mpJitMgr->mArch.AVX())
+            {
+                SWR_ASSERT(false, "Unsupported AVX architecture.");
+            }
+        }
+
+        // Try to decipher the vector type of the instruction. This does not work properly
+        // across all intrinsics, and will have to be rethought. Probably need something
+        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
+        // intrinsic.
+        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
+        {
+            // Prefer the call's return type; otherwise use the first vector-typed argument.
+            Type* pVecTy = pCallInst->getType();
+            if (!pVecTy->isVectorTy())
+            {
+                for (auto& op : pCallInst->arg_operands())
+                {
+                    if (op.get()->getType()->isVectorTy())
+                    {
+                        pVecTy = op.get()->getType();
+                        break;
+                    }
+                }
+            }
+            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
+
+            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
+            switch (width)
+            {
+            case 256: *pWidth = W256; break;
+            case 512: *pWidth = W512; break;
+            default: SWR_ASSERT(false, "Unhandled vector width %u", width);
+                *pWidth = W256;
+            }
+
+            *pTy = pVecTy->getScalarType();
+        }
+
+        Value* GetZeroVec(TargetWidth width, Type* pTy)
+        {
+            // Element count used for the zero vector: 8 for W256, 16 for W512;
+            // any other width value leaves the count at zero.
+            uint32_t laneCount = 0;
+            if (width == W256) { laneCount = 8; }
+            else if (width == W512) { laneCount = 16; }
+
+            VectorType* pZeroTy = VectorType::get(pTy, laneCount);
+            return ConstantVector::getNullValue(pZeroTy);
+        }
+
+        Value* GetMask(TargetWidth width)
+        {
+            // Constant with every bit set: built from uint8_t for W256 and uint16_t for W512.
+            switch (width)
+            {
+            case W256: return B->C((uint8_t)-1);
+            case W512: return B->C((uint16_t)-1);
+            default:   SWR_ASSERT(false, "Unhandled vector width"); return nullptr;
+            }
+        }
+
+        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
+        {
+            // Lower one call through the per-target table: native intrinsic if listed, else emulation.
+            Function* pFunc = pCallInst->getCalledFunction();
+
+            // Use find() rather than operator[] so an unknown name can never insert an empty entry
+            auto entry = intrinsicMap2[mTarget].find(pFunc->getName());
+            SWR_ASSERT(entry != intrinsicMap2[mTarget].end(), "Intrinsic missing from lowering table.");
+            auto& intrinsic = entry->second;
+
+            TargetWidth vecWidth;
+            Type* pElemTy;
+            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);
+
+            // No native intrinsic for this target/width: call the emulation function
+            Intrinsic::ID id = intrinsic.intrin[vecWidth];
+            if (id == Intrinsic::not_intrinsic)
+            {
+                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
+            }
+
+            Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
+            SmallVector<Value*, 8> args;
+            for (auto& arg : pCallInst->arg_operands())
+            {
+                args.push_back(arg.get());
+            }
+
+            // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
+            // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
+            if (mTarget == AVX512)
+            {
+                args.push_back(GetZeroVec(vecWidth, pElemTy));
+                args.push_back(GetMask(vecWidth));
+            }
+
+            return B->CALLA(pIntrin, args);
+        }
+
+        Instruction* ProcessIntrinsic(CallInst* pCallInst)
+        {
+            Function* pFunc = pCallInst->getCalledFunction();
+            // Forward to the advanced support if found
+            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
+            {
+                return ProcessIntrinsicAdvanced(pCallInst);
+            }
+
+            // getName() yields a StringRef, not a C string; convert it before feeding the %s format
+            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName().str().c_str());
+
+            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
+            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
+
+            SmallVector<Value*, 8> args;
+            for (auto& arg : pCallInst->arg_operands())
+            {
+                args.push_back(arg.get());
+            }
+            return B->CALLA(pX86IntrinFunc, args);
+        }
+
+        //////////////////////////////////////////////////////////////////////////
+        /// @brief LLVM function pass run method.
+        /// @param F - The function we're working on with this pass.
+        /// @return true if at least one meta intrinsic call was lowered (the IR changed).
+        virtual bool runOnFunction(Function& F)
+        {
+            std::vector<Instruction*> toRemove;
+
+            for (auto& BB : F.getBasicBlockList())
+            {
+                for (auto& I : BB.getInstList())
+                {
+                    CallInst* pCallInst = dyn_cast<CallInst>(&I);
+                    Function* pFunc = pCallInst ? pCallInst->getCalledFunction() : nullptr;
+
+                    // Skip non-calls, calls without a known callee, and anything that isn't a meta intrinsic
+                    if (!pFunc || !pFunc->getName().startswith("meta.intrinsic"))
+                    {
+                        continue;
+                    }
+
+                    // Emit the replacement before the call; the call is erased only after the walk
+                    B->IRB()->SetInsertPoint(&I);
+                    Instruction* pReplace = ProcessIntrinsic(pCallInst);
+                    SWR_ASSERT(pReplace);
+                    toRemove.push_back(pCallInst);
+                    pCallInst->replaceAllUsesWith(pReplace);
+                }
+            }
+
+            for (auto* pInst : toRemove)
+            {
+                pInst->eraseFromParent();
+            }
+
+            JitManager::DumpToFile(&F, "lowerx86");
+
+            return !toRemove.empty();
+        }
+
+        virtual void getAnalysisUsage(AnalysisUsage& AU) const
+        {
+        }
+
+        JitManager* JM() { return mpJitMgr; }
+
+        JitManager* mpJitMgr;
+        Builder* B;
+
+        TargetArch mTarget;
+
+        static char ID;  ///< Needed by LLVM to generate ID for FunctionPass.
+    };
+
+    char LowerX86::ID = 0;   // LLVM uses address of ID as the actual ID.
+
+    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
+    {
+        return new LowerX86(pJitMgr, b);
+    }
+
+    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        SWR_ASSERT(false, "Unimplemented intrinsic emulation."); // placeholder for table entries with no emulation path
+        return nullptr;
+    }
+
+    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        // Only need vperm emulation for AVX; the AVX2/AVX512 table entries list native intrinsics
+        SWR_ASSERT(arch == AVX);
+
+        Builder* B = pThis->B;
+        auto v32A = pCallInst->getArgOperand(0); // vector whose elements are permuted
+        auto vi32Index = pCallInst->getArgOperand(1); // per-element source indices
+
+        Value* v32Result;
+        if (isa<Constant>(vi32Index))
+        {
+            // Can use llvm shuffle vector directly with constant shuffle indices
+            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
+        }
+        else
+        {
+            v32Result = UndefValue::get(v32A->getType()); // filled in one element at a time below
+            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
+            {
+                auto i32Index = B->VEXTRACT(vi32Index, B->C(l)); // runtime index for element l
+                auto val = B->VEXTRACT(v32A, i32Index);
+                v32Result = B->VINSERT(v32Result, val, B->C(l));
+            }
+        }
+        return cast<Instruction>(v32Result); // NOTE(review): assumes the builder returned an Instruction, not a folded Constant - confirm
+    }
+
+    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        // Lowers a meta gather call (src, base, indices, i1 lane mask, scale) for the given arch/width.
+        Builder* B = pThis->B;
+        auto vSrc = pCallInst->getArgOperand(0);
+        auto pBase = pCallInst->getArgOperand(1);
+        auto vi32Indices = pCallInst->getArgOperand(2);
+        auto vi1Mask = pCallInst->getArgOperand(3);
+        auto i8Scale = pCallInst->getArgOperand(4);
+        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
+        uint32_t numElem = vSrc->getType()->getVectorNumElements();
+        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+        auto srcTy = vSrc->getType()->getVectorElementType();
+        Value* v32Gather = nullptr; // stays null if no arch/width branch below applies
+        if (arch == AVX)
+        {
+            // Full emulation for AVX
+            // Store source on stack to provide a valid address to load from inactive lanes
+            auto pStack = B->STACKSAVE();
+            auto pTmp = B->ALLOCA(vSrc->getType());
+            B->STORE(vSrc, pTmp);
+
+            v32Gather = UndefValue::get(vSrc->getType());
+            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
+            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
+
+            for (uint32_t i = 0; i < numElem; ++i)
+            {
+                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
+                auto pLoadAddress = B->GEP(pBase, i32Offset);
+                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
+                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
+                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
+                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
+                auto val = B->LOAD(pValidAddress);
+                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
+            }
+
+            B->STACKRESTORE(pStack);
+        }
+        else if (arch == AVX2 || (arch == AVX512 && width == W256))
+        {
+            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
+            if (width == W256)
+            {
+                auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
+                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
+            }
+            else if (width == W512)
+            {
+                // Double pump 8-wide
+                auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
+                Value *src0 = B->EXTRACT_16(vSrc, 0);
+                Value *src1 = B->EXTRACT_16(vSrc, 1);
+
+                Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
+                Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
+
+                Value *mask0 = B->EXTRACT_16(v32Mask, 0);
+                Value *mask1 = B->EXTRACT_16(v32Mask, 1);
+
+                Value *gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
+                Value *gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+
+                v32Gather = B->JOIN_16(gather0, gather1);
+            }
+        }
+        else if (arch == AVX512)
+        {
+            auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);
+
+            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
+            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });
+        }
+
+        SWR_ASSERT(v32Gather, "Unhandled arch/width combination for gather lowering.");
+        return cast<Instruction>(v32Gather);
+    }
+}
+
+using namespace SwrJit;
+
+INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
+INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
new file mode 100644
index 00000000000..f7373f034be
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
@@ -0,0 +1,37 @@
+/****************************************************************************
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file passes.h
+*
+* @brief Include file for llvm passes
+*
+******************************************************************************/
+#pragma once
+#include "JitManager.h"
+#include "builder.h"
+
+namespace SwrJit
+{
+    using namespace llvm;
+    /// Creates the function pass that lowers "meta.intrinsic.*" calls (implemented in lower_x86.cpp).
+    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
index 4a2f46ae1e7..5c201990b50 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build
+++ b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
@@ -37,12 +37,12 @@ gen_builder_hpp = custom_target(
   build_by_default : true,
 )
 
-gen_builder_x86_hpp = custom_target(
-  'gen_builder_x86.hpp',
+gen_builder_meta_hpp = custom_target(
+  'gen_builder_meta.hpp',
   input : '../codegen/gen_llvm_ir_macros.py',
-  output : 'gen_builder_x86.hpp',
+  output : 'gen_builder_meta.hpp',
   command : [
-    prog_python2, '@INPUT0@', '--gen_x86_h', '--output', '@OUTPUT@',
+    prog_python2, '@INPUT0@', '--gen_meta_h', '--output', '@OUTPUT@',
     '--output-dir', '@OUTDIR@'
   ],
   depend_files : swr_gen_builder_depends,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index f9d858090ff..15a6bc40289 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -32,6 +32,7 @@
 #include "jit_api.h"
 #include "streamout_jit.h"
 #include "gen_state_llvm.h"
+#include "functionpasses/passes.h"
 
 using namespace llvm;
 using namespace SwrJit;
@@ -306,6 +307,8 @@ struct StreamOutJit : public Builder
         passes.add(createSCCPPass());
         passes.add(createAggressiveDCEPass());
 
+        passes.add(createLowerX86Pass(JM(), this));
+
         passes.run(*soFunc);
 
         JitManager::DumpToFile(soFunc, "SoFunc_optimized");