4 files changed, 194 insertions, 47 deletions
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index b38498efc57..fd499010516 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -112,7 +112,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
    float (*consts)[4] = (float (*)[4]) draw->mapped_constants;
    struct ga_llvm_prog *prog = draw->vertex_shader->state->llvm_prog;
 
-   fprintf(stderr, "XX q(%d) ", draw->vs.queue_nr);
+   fprintf(stderr, "--- XX q(%d) ", draw->vs.queue_nr);
 
    /* fetch the inputs */
    for (i = 0; i < draw->vs.queue_nr; ++i) {
@@ -123,7 +123,8 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
 
    /* batch execute the shaders on all the vertices */
    ga_llvm_prog_exec(prog, inputs, dests, consts,
-                     draw->vs.queue_nr);
+                     draw->vs.queue_nr,
+                     draw->vertex_info.num_attribs);
 
    draw->vs.queue_nr = 0;
 }
diff --git a/src/mesa/pipe/llvm/llvm_builtins.c b/src/mesa/pipe/llvm/llvm_builtins.c
index 0f0efeb3039..c7a9ea0d5a1 100644
--- a/src/mesa/pipe/llvm/llvm_builtins.c
+++ b/src/mesa/pipe/llvm/llvm_builtins.c
@@ -1,20 +1,29 @@
+/* clang --emit-llvm llvm_builtins.c |llvm-as |opt -std-compile-opts |llvm-dis */
+/* clang --emit-llvm llvm_builtins.c |llvm-as |opt -std-compile-opts |llvm2cpp -for=Shader -gen-module -funcname=createBaseShader */
+typedef __attribute__(( ocu_vector_type(4) )) float float4;
 
+#if 0
+//clang doesn't support "struct->member" notation yet
+struct vertex_header {
+   unsigned clipmask:12;
+   unsigned edgeflag:1;
+   unsigned pad:3;
+   unsigned vertex_id:16;
 
-inline float4 compute_clip(float4 vec, float4 scale, float4 trans)
-{
-   return vec*scale + trans;
-}
+   float clip[4];
 
+   float data[][4];
+};
 
 inline float
-dot4(const float4 a, const float4 b)
+dot4(float4 a, float4 b)
 {
    float4 c = a*b;
    return c.x + c.y + c.z + c.w;
 }
 
 inline unsigned
-compute_clipmask(float4 clip, const float4 (*plane), unsigned nr)
+compute_clipmask(float4 clip, float4 (*plane), unsigned nr)
 {
    unsigned mask = 0;
    unsigned i;
@@ -29,7 +38,8 @@ compute_clipmask(float4 clip, const float4 (*plane), unsigned nr)
 
 inline void collect_results(float4 *results, struct vertex_header *vOut,
                             float4 *planes, int nr_planes,
-                            float4 scale, float4 trans)
+                            float4 scale, float4 trans,
+                            int num_attribs)
 {
    /* store results */
    unsigned slot;
@@ -38,13 +48,14 @@ inline void collect_results(float4 *results, struct vertex_header *vOut,
    /* Handle attr[0] (position) specially:
     */
    float4 res0 = results[0];
-   x = vOut->clip[0] = clip.x;
-   y = vOut->clip[1] = clip.y;
-   z = vOut->clip[2] = clip.z;
-   w = vOut->clip[3] = clip.w;
+   float *clip = vOut->clip;
+   x = clip[0] = res0.x;
+   y = clip[1] = res0.y;
+   z = clip[2] = res0.z;
+   w = clip[3] = res0.w;
 
-   vOut[i]->clipmask = compute_clipmask(res0, planes, nr_planes);
-   vOut[i]->edgeflag = 1;
+   vOut->clipmask = compute_clipmask(res0, planes, nr_planes);
+   vOut->edgeflag = 1;
 
    /* divide by w */
    w = 1.0f / w;
@@ -54,10 +65,10 @@ inline void collect_results(float4 *results, struct vertex_header *vOut,
    res0.x = x; res0.y = y; res0.z = z; res0.w = 1;
 
    /* Viewport mapping */
-   res = res * scale + trans;
-   vOut->data[0][0] = res.x;
-   vOut->data[0][1] = res.y;
-   vOut->data[0][2] = res.z;
+   res0 = res0 * scale + trans;
+   vOut->data[0][0] = res0.x;
+   vOut->data[0][1] = res0.y;
+   vOut->data[0][2] = res0.z;
    vOut->data[0][3] = w;
 
    /* Remaining attributes are packed into sequential post-transform
@@ -65,7 +76,7 @@ inline void collect_results(float4 *results, struct vertex_header *vOut,
     * Skip 0 since we just did it above.
     * Subtract two because of the VERTEX_HEADER, CLIP_POS attribs.
     */
-   for (slot = 1; slot < draw->vertex_info.num_attribs - 2; slot++) {
+   for (slot = 1; slot < num_attribs - 2; slot++) {
       float4 vec = results[slot];
       vOut->data[slot][0] = vec.x;
       vOut->data[slot][1] = vec.y;
@@ -79,12 +90,68 @@ inline void collect_results(float4 *results, struct vertex_header *vOut,
              vOut->data[slot][3]);
    }
 }
+#endif
 
-void run_vertex_shader(float ainputs[VS_QUEUE_LENGTH][PIPE_MAX_SHADER_INPUTS][4],
-                       struct vertex_header *dests[VS_QUEUE_LENGTH],
-                       float *aconsts[4]
-                       int count)
+void from_array(float4 (*res)[32], float (*ainputs)[32][4],
+                int count, int num_attribs) /* packs ainputs[i][j][0..3] into res[i][j] for i < count, j < num_attribs */
 {
-   float4  inputs[VS_QUEUE_LENGTH][PIPE_MAX_SHADER_INPUTS];
-   float4 *consts;
+   for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < num_attribs; ++j) {
+         float4 vec;
+         vec.x = ainputs[i][j][0];
+         vec.y = ainputs[i][j][1];
+         vec.z = ainputs[i][j][2];
+         vec.w = ainputs[i][j][3];
+         res[i][j] = vec;
+      }
+   }
+}
+
+void from_consts(float4 *res, float (*ainputs)[4],
+                int count)
+{ /* Packs the first 'count' float[4] rows of ainputs into float4 vectors in res. */
+   for (int i = 0; i < count; ++i) {
+      float4 vec;
+      vec.x = ainputs[i][0];
+      vec.y = ainputs[i][1];
+      vec.z = ainputs[i][2];
+      vec.w = ainputs[i][3];
+      res[i] = vec;
+   }
+}
+
+void to_array(float (*dests)[4], float4 *in, int num_attribs)
+{ /* Unpacks the first 'num_attribs' float4 vectors of 'in' into float[4] rows of dests. */
+   for (int i = 0; i < num_attribs; ++i) {
+      float  *rd = dests[i];
+      float4  ri = in[i];
+      rd[0] = ri.x;
+      rd[1] = ri.y;
+      rd[2] = ri.z;
+      rd[3] = ri.w;
+   }
+}
+
+extern void execute_shader(float4 *dests, float4 *inputs,
+                           float4 *consts);
+
+void run_vertex_shader(float (*ainputs)[32][4],  /* in: per-vertex attribute rows */
+                       float (*dests)[32][4],    /* out: per-vertex result rows */
+                       float (*aconsts)[4],      /* in: constants; 32 rows are read */
+                       int count,                /* number of vertices to process */
+                       int num_attribs)          /* attributes converted per vertex */
+{
+   float4  inputs[16*32*4][32];
+   float4  consts[32];
+   float4  results[16*32*4][32];
+   /* Pack inputs and constants into float4, run the shader per vertex, unpack results. */
+   printf("XXXXXXXXXXX run_vertex_shader\n");
+   from_array(inputs, ainputs, count, num_attribs);
+   from_consts(consts, aconsts, 32);
+   for (int i = 0; i < count; ++i) {
+      float4 *in  = inputs[i];
+      float4 *res = results[i];
+      execute_shader(res, in, consts);      /* must run before the results are copied out */
+      to_array(dests[i], res, num_attribs);
+   }
 }
diff --git a/src/mesa/pipe/llvm/llvmtgsi.cpp b/src/mesa/pipe/llvm/llvmtgsi.cpp
index 46b7561b5e8..1abc148521f 100644
--- a/src/mesa/pipe/llvm/llvmtgsi.cpp
+++ b/src/mesa/pipe/llvm/llvmtgsi.cpp
@@ -14,15 +14,88 @@
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
 #include <llvm/ParameterAttributes.h>
 #include <llvm/Support/PatternMatch.h>
 #include <llvm/ExecutionEngine/JIT.h>
 #include <llvm/ExecutionEngine/Interpreter.h>
 #include <llvm/ExecutionEngine/GenericValue.h>
 #include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <iostream>
 
+using namespace llvm;
+#include "llvm_base_shader.cpp"
+
+
+static inline void addPass(PassManager &PM, Pass *P) {
+  // Add the pass to the pass manager...
+  PM.add(P);
+}
+
+static inline void AddStandardCompilePasses(PassManager &PM) {
+   PM.add(createVerifierPass());                  // Verify that input is correct
+
+   addPass(PM, createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
+
+   // If the -strip-debug command line option was specified, do it.
+   //if (StripDebug)
+   //  addPass(PM, createStripSymbolsPass(true));
+
+   addPass(PM, createRaiseAllocationsPass());     // call %malloc -> malloc inst
+   addPass(PM, createCFGSimplificationPass());    // Clean up disgusting code
+   addPass(PM, createPromoteMemoryToRegisterPass());// Kill useless allocas
+   addPass(PM, createGlobalOptimizerPass());      // Optimize out global vars
+   addPass(PM, createGlobalDCEPass());            // Remove unused fns and globs
+   addPass(PM, createIPConstantPropagationPass());// IP Constant Propagation
+   addPass(PM, createDeadArgEliminationPass());   // Dead argument elimination
+   addPass(PM, createInstructionCombiningPass()); // Clean up after IPCP & DAE
+   addPass(PM, createCFGSimplificationPass());    // Clean up after IPCP & DAE
+
+   addPass(PM, createPruneEHPass());              // Remove dead EH info
+
+   //if (!DisableInline)
+   addPass(PM, createFunctionInliningPass());   // Inline small functions
+   addPass(PM, createArgumentPromotionPass());    // Scalarize uninlined fn args
+
+   addPass(PM, createTailDuplicationPass());      // Simplify cfg by copying code
+   addPass(PM, createInstructionCombiningPass()); // Cleanup for scalarrepl.
+   addPass(PM, createCFGSimplificationPass());    // Merge & remove BBs
+   addPass(PM, createScalarReplAggregatesPass()); // Break up aggregate allocas
+   addPass(PM, createInstructionCombiningPass()); // Combine silly seq's
+   addPass(PM, createCondPropagationPass());      // Propagate conditionals
+
+   addPass(PM, createTailCallEliminationPass());  // Eliminate tail calls
+   addPass(PM, createCFGSimplificationPass());    // Merge & remove BBs
+   addPass(PM, createReassociatePass());          // Reassociate expressions
+   addPass(PM, createLoopRotatePass());
+   addPass(PM, createLICMPass());                 // Hoist loop invariants
+   addPass(PM, createLoopUnswitchPass());         // Unswitch loops.
+   addPass(PM, createLoopIndexSplitPass());       // Index split loops.
+   addPass(PM, createInstructionCombiningPass()); // Clean up after LICM/reassoc
+   addPass(PM, createIndVarSimplifyPass());       // Canonicalize indvars
+   addPass(PM, createLoopUnrollPass());           // Unroll small loops
+   addPass(PM, createInstructionCombiningPass()); // Clean up after the unroller
+   addPass(PM, createGVNPass());                  // Remove redundancies
+   addPass(PM, createSCCPPass());                 // Constant prop with SCCP
+
+   // Run instcombine after redundancy elimination to exploit opportunities
+   // opened up by them.
+   addPass(PM, createInstructionCombiningPass());
+   addPass(PM, createCondPropagationPass());      // Propagate conditionals
+
+   addPass(PM, createDeadStoreEliminationPass()); // Delete dead stores
+   addPass(PM, createAggressiveDCEPass());        // SSA based 'Aggressive DCE'
+   addPass(PM, createCFGSimplificationPass());    // Merge & remove BBs
+   addPass(PM, createSimplifyLibCallsPass());     // Library Call Optimizations
+   addPass(PM, createDeadTypeEliminationPass());  // Eliminate dead types
+   addPass(PM, createConstantMergePass());        // Merge dup global constants
+}
 
 static void
 translate_declaration(llvm::Module *module,
@@ -341,7 +414,7 @@ translate_instruction(llvm::Module *module,
 static llvm::Module *
 tgsi_to_llvm(const struct tgsi_token *tokens)
 {
-   llvm::Module *mod = new llvm::Module("tgsi");
+   llvm::Module *mod = createBaseShader();
    struct tgsi_parse_context parse;
    struct tgsi_full_instruction fi;
    struct tgsi_full_declaration fd;
@@ -402,18 +475,33 @@ ga_llvm_from_tgsi(const struct tgsi_token *tokens)
    struct ga_llvm_prog *ga_llvm =
       (struct ga_llvm_prog *)malloc(sizeof(struct ga_llvm_prog));
    llvm::Module *mod = tgsi_to_llvm(tokens);
+
+   /* Run optimization passes over it */
+   PassManager passes;
+   // Add an appropriate TargetData instance for this module...
+   passes.add(new TargetData(mod));
+   AddStandardCompilePasses(passes);
+   std::cout<<"Running optimization passes..."<<std::endl;
+   bool b = passes.run(*mod);
+   std::cout<<"\tModified mod = "<<b<<std::endl;
+
    llvm::ExistingModuleProvider *mp =
       new llvm::ExistingModuleProvider(mod);
-   //llvm::ExecutionEngine *ee =
-   //   llvm::ExecutionEngine::create(mp, false);
+   llvm::ExecutionEngine *ee =
+      llvm::ExecutionEngine::create(mp, false);
 
    ga_llvm->module = mod;
-   ga_llvm->engine = 0;//ee;
+   ga_llvm->engine = ee;
    fprintf(stderr, "DUMPX \n");
    //tgsi_dump(tokens, TGSI_DUMP_VERBOSE);
    tgsi_dump(tokens, 0);
    fprintf(stderr, "DUMPEND \n");
 
+   Function *func = mod->getFunction("run_vertex_shader");
+   std::cout << "run_vertex_shader  = "<<func;
+   ga_llvm->function = ee->getPointerToFunctionOrStub(func);
+   std::cout << " -- FUNC is " <<ga_llvm->function;
+
    return ga_llvm;
 }
 
@@ -423,6 +511,7 @@ void ga_llvm_prog_delete(struct ga_llvm_prog *prog)
    delete mod;
    prog->module = 0;
    prog->engine = 0;
+   prog->function = 0;
    free(prog);
 }
 
@@ -430,24 +519,12 @@ int ga_llvm_prog_exec(struct ga_llvm_prog *prog,
                       float (*inputs)[32][4],
                       void *dests[16*32*4],
                       float (*consts)[4],
-                      int count)
+                      int count,
+                      int num_attribs)
 {
-   //std::cout << "START "<<std::endl;
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   llvm::Function *func = mod->getFunction("main");
-   llvm::ExecutionEngine *ee = static_cast<llvm::ExecutionEngine*>(prog->engine);
-
-   std::vector<llvm::GenericValue> args(0);
-   //args[0] = GenericValue(&st);
-   //std::cout << "Mod is "<<*mod;
-   //std::cout << "\n\nRunning llvm: " << std::endl;
-   if (func) {
-      std::cout << "Func is "<<func;
-      llvm::GenericValue gv = ee->runFunction(func, args);
-   }
+   std::cout << "---- START LLVM Execution "<<std::endl;
 
-//delete ee;
-//delete mp;
 
+   std::cout << "---- END LLVM Execution "<<std::endl;
    return 0;
 }
diff --git a/src/mesa/pipe/llvm/llvmtgsi.h b/src/mesa/pipe/llvm/llvmtgsi.h
index 9fbb0ea8f9d..b1b5717f6d0 100644
--- a/src/mesa/pipe/llvm/llvmtgsi.h
+++ b/src/mesa/pipe/llvm/llvmtgsi.h
@@ -12,6 +12,7 @@ struct tgsi_sampler;
 struct ga_llvm_prog {
    void *module;
    void *engine;
+   void *function; /* address the execution engine returns for "run_vertex_shader"; set in ga_llvm_from_tgsi */
 };
 struct ga_llvm_prog *
 ga_llvm_from_tgsi(const struct tgsi_token *tokens);
@@ -22,7 +23,8 @@ int ga_llvm_prog_exec(struct ga_llvm_prog *prog,
                       float (*inputs)[32][4],
                       void *dests[16*32*4],
                       float (*consts)[4],
-                      int count);
+                      int count,
+                      int num_attribs);
 
 #if defined __cplusplus
 } // extern "C"