12 files changed, 121 insertions, 79 deletions
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 3ff8da7e4d6..6e13f5289c5 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -220,6 +220,7 @@ tags:
 
 clean:
 	-rm -f */*.o
+	-rm -f */*/*.o
 	-rm -f depend depend.bak mesa.a
 	-rm -f drivers/*/*.o
 	(cd drivers/dri ; $(MAKE) clean)
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index 767cef59e2e..1972826a756 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -842,7 +842,7 @@ _mesa_printf( const char *fmtString, ... )
 #if defined(XFree86LOADER) && defined(IN_MODULE)
    xf86printf("%s", s);
 #else
-   printf("%s", s);
+   fprintf(stderr,"%s", s);
 #endif
 }
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 62a8c76aa14..1733c43bce0 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1770,6 +1770,7 @@ struct vertex_program
    GLuint InputsRead;     /* Bitmask of which input regs are read */
    GLuint OutputsWritten; /* Bitmask of which output regs are written to */
    struct program_parameter_list *Parameters; /**< array [NumParameters] */
+   void *TnlData;		/* should probably use Base.DriverData */
 };
 
 
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index 48119d2d58e..a6a7cdc738b 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -95,7 +95,11 @@ _tnl_CreateContext( GLcontext *ctx )
    _tnl_save_init( ctx );
    _tnl_array_init( ctx );
    _tnl_vtx_init( ctx );
-   _tnl_install_pipeline( ctx, _tnl_default_pipeline );
+
+   if (ctx->_MaintainTnlProgram) 
+      _tnl_install_pipeline( ctx, _tnl_vp_pipeline );
+   else 
+      _tnl_install_pipeline( ctx, _tnl_default_pipeline );
 
    /* Initialize the arrayelt helper
     */
@@ -140,6 +144,8 @@ _tnl_DestroyContext( GLcontext *ctx )
    _tnl_destroy_pipeline( ctx );
    _ae_destroy_context( ctx );
 
+   _tnl_ProgramCacheDestroy( ctx );
+
    FREE(tnl);
    ctx->swtnl_context = NULL;
 }
diff --git a/src/mesa/tnl/t_context.h b/src/mesa/tnl/t_context.h
index 4988920cf25..cdaa252e8fc 100644
--- a/src/mesa/tnl/t_context.h
+++ b/src/mesa/tnl/t_context.h
@@ -614,6 +614,15 @@ struct tnl_clipspace
 };
 
 
+
+struct tnl_cache {          /* node type pointed to by TNLcontext.vp_cache */
+   GLuint hash;             /* NOTE(review): presumably a hash of key - confirm */
+   void *key;               /* untyped key pointer */
+   void *data;              /* untyped payload; presumably the value cached for key */
+   struct tnl_cache *next;  /* pointer to another node of the same type */
+};
+
+
 struct tnl_device_driver
 {
    /***
@@ -769,6 +778,8 @@ typedef struct
    GLvertexformat exec_vtxfmt;
    GLvertexformat save_vtxfmt;
 
+   struct tnl_cache *vp_cache;
+
 } TNLcontext;
 
 
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
index 6b0ea815e2b..61bfed290e9 100644
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -131,9 +131,8 @@ void _tnl_run_pipeline( GLcontext *ctx )
     * (ie const or non-const).
     */
    if (check_input_changes( ctx ) || tnl->pipeline.new_state) {
-#if TNL_FIXED_FUNCTION_PROGRAM
-      _tnl_UpdateFixedFunctionProgram( ctx );
-#endif
+      if (ctx->_MaintainTnlProgram)
+	 _tnl_UpdateFixedFunctionProgram( ctx );
 
       for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
 	 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
@@ -197,9 +196,6 @@ void _tnl_run_pipeline( GLcontext *ctx )
  * case, if it becomes necessary to do so.
  */
 const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
-#if TNL_FIXED_FUNCTION_PROGRAM
-   &_tnl_arb_vertex_program_stage,
-#else
    &_tnl_vertex_transform_stage,
    &_tnl_normal_transform_stage,
    &_tnl_lighting_stage,
@@ -208,9 +204,15 @@ const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
    &_tnl_texture_transform_stage,
    &_tnl_point_attenuation_stage,
 #if defined(FEATURE_NV_vertex_program) || defined(FEATURE_ARB_vertex_program)
-   &_tnl_vertex_program_stage,
-#endif
+   &_tnl_arb_vertex_program_stage,
+   &_tnl_vertex_program_stage, 
 #endif
    &_tnl_render_stage,
    NULL 
 };
+
+const struct tnl_pipeline_stage *_tnl_vp_pipeline[] = {  /* installed by _tnl_CreateContext when ctx->_MaintainTnlProgram is set */
+   &_tnl_arb_vertex_program_stage,  /* only vertex-processing stage in this pipeline */
+   &_tnl_render_stage,
+   NULL                             /* end marker, as in _tnl_default_pipeline */
+};
diff --git a/src/mesa/tnl/t_pipeline.h b/src/mesa/tnl/t_pipeline.h
index a9d8c7e4899..6c7a0814c56 100644
--- a/src/mesa/tnl/t_pipeline.h
+++ b/src/mesa/tnl/t_pipeline.h
@@ -59,6 +59,7 @@ extern const struct tnl_pipeline_stage _tnl_render_stage;
 /* Shorthand to plug in the default pipeline:
  */
 extern const struct tnl_pipeline_stage *_tnl_default_pipeline[];
+extern const struct tnl_pipeline_stage *_tnl_vp_pipeline[];
 
 
 /* Convenience routines provided by t_vb_render.c:
diff --git a/src/mesa/tnl/t_vb_arbprogram.c b/src/mesa/tnl/t_vb_arbprogram.c
index 07dc34d2668..4789b790175 100644
--- a/src/mesa/tnl/t_vb_arbprogram.c
+++ b/src/mesa/tnl/t_vb_arbprogram.c
@@ -56,7 +56,6 @@ struct opcode_info {
 struct compilation {
    GLuint reg_active;
    union instruction *csr;
-   struct vertex_buffer *VB;	/* for input sizes! */
 };
 
 
@@ -518,7 +517,7 @@ static void print_reg( GLuint file, GLuint reg )
 	 _mesa_printf("ARG%d", reg - REG_ARG0);
       else if (reg >= REG_TMP0 && reg <= REG_TMP11)
 	 _mesa_printf("TMP%d", reg - REG_TMP0);
-      else if (reg >= REG_IN0 && reg <= REG_IN15)
+      else if (reg >= REG_IN0 && reg <= REG_IN31)
 	 _mesa_printf("IN%d", reg - REG_IN0);
       else if (reg >= REG_OUT0 && reg <= REG_OUT14)
 	 _mesa_printf("OUT%d", reg - REG_OUT0);
@@ -989,18 +988,33 @@ static void cvp_emit_inst( struct compilation *cp,
    }
 }
 
+static void free_tnl_data( struct vertex_program *program  )
+{
+   /* Release the compiled form cached in program->TnlData; no-op if unset. */
+   struct tnl_compiled_program *p = program->TnlData;
+   if (p) { free((void *)p->compiled_func); free(p); }  /* free(NULL) is harmless */
+   program->TnlData = NULL;
+}
 
-static void compile_vertex_program( struct arb_vp_machine *m,
-				    const struct vertex_program *program )
+static void compile_vertex_program( struct vertex_program *program,
+				    GLboolean try_codegen )
 { 
    struct compilation cp;
+   struct tnl_compiled_program *p = CALLOC_STRUCT(tnl_compiled_program);
    GLuint i;
 
-   /* Initialize cp:
+   _mesa_printf("%s\n", __FUNCTION__);
+
+   if (program->TnlData) 
+      free_tnl_data( program );
+   
+   program->TnlData = p;
+
+   /* Initialize cp.  Note that ctx and VB aren't used in compilation
+    * so we don't have to worry about state changes:
     */
    memset(&cp, 0, sizeof(cp));
-   cp.VB = m->VB;
-   cp.csr = m->store;
+   cp.csr = p->instructions;
 
    /* Compile instructions:
     */
@@ -1010,24 +1024,20 @@ static void compile_vertex_program( struct arb_vp_machine *m,
 
    /* Finish up:
     */
-   m->instructions = m->store;
-   m->nr_instructions = cp.csr - m->store;
-
+   p->nr_instructions = cp.csr - p->instructions;
 
    /* Print/disassemble:
     */
    if (DISASSEM) {
-      for (i = 0; i < m->nr_instructions; i++) {
-	 _tnl_disassem_vba_insn(m->instructions[i]);
+      for (i = 0; i < p->nr_instructions; i++) {
+	 _tnl_disassem_vba_insn(p->instructions[i]);
       }
       _mesa_printf("\n\n");
    }
    
 #ifdef USE_SSE_ASM
-   /* TODO: check if anything changed...
-    */
-   if (m->try_codegen)
-      _tnl_sse_codegen_vertex_program(m);
+   if (try_codegen)
+      _tnl_sse_codegen_vertex_program(p);
 #endif
 
 }
@@ -1046,7 +1056,7 @@ static void userclip( GLcontext *ctx,
 {
    GLuint p;
 
-   for (p = 0; p < ctx->Const.MaxClipPlanes; p++)
+   for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
 	 GLuint nr, i;
 	 const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
@@ -1079,6 +1089,7 @@ static void userclip( GLcontext *ctx,
 	    }
 	 }
       }
+   }
 }
 
 
@@ -1138,9 +1149,10 @@ static GLboolean do_ndc_cliptest( struct arb_vp_machine *m )
 }
 
 
-static void call_func( struct arb_vp_machine *m )
+static INLINE void call_func( struct tnl_compiled_program *p,
+			      struct arb_vp_machine *m )
 {
-   m->func(m);
+   p->compiled_func(m);   /* run the codegen'd program on m; run_arb_vertex_program checks the pointer first */
 }
 
 /**
@@ -1160,12 +1172,18 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 				     ctx->_TnlProgram);
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
-   GLuint i, j, outputs = program->OutputsWritten;
+   struct tnl_compiled_program *p;
+   GLuint i, j, outputs;
+
+   if (!program || program->IsNVProgram)
+      return GL_TRUE;   
 
    if (program->Parameters) {
       _mesa_load_state_parameters(ctx, program->Parameters);
    }   
    
+   p = (struct tnl_compiled_program *)program->TnlData;
+   assert(p);
 
    /* Initialize regs where necessary:
     */
@@ -1173,11 +1191,11 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 
    m->nr_inputs = m->nr_outputs = 0;
 
-   for (i = 0; i < 16; i++) {
+   for (i = 0; i < _TNL_ATTRIB_MAX; i++) {
       if (program->InputsRead & (1<<i)) {
 	 GLuint j = m->nr_inputs++;
 	 m->input[j].idx = i;
-	 m->input[j].data = m->VB->AttribPtr[i]->data;
+	 m->input[j].data = (GLfloat *)m->VB->AttribPtr[i]->data;
 	 m->input[j].stride = m->VB->AttribPtr[i]->stride;
 	 m->input[j].size = m->VB->AttribPtr[i]->size;
 	 ASSIGN_4V(m->File[0][REG_IN0 + i], 0, 0, 0, 1);
@@ -1188,7 +1206,7 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
       if (program->OutputsWritten & (1<<i)) {
 	 GLuint j = m->nr_outputs++;
 	 m->output[j].idx = i;
-	 m->output[j].data = m->attribs[i].data;
+	 m->output[j].data = (GLfloat *)m->attribs[i].data;
       }
    }     
 
@@ -1208,12 +1226,12 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 	 STRIDE_F(m->input[j].data, m->input[j].stride);
       }
 
-      if (m->func) {
-	 call_func( m );
+      if (p->compiled_func) {
+	 call_func( p, m );
       }
       else {
-	 for (j = 0; j < m->nr_instructions; j++) {
-	    union instruction inst = m->instructions[j];	 
+	 for (j = 0; j < p->nr_instructions; j++) {
+	    union instruction inst = p->instructions[j];	 
 	    opcode_func[inst.alu.opcode]( m, inst );
 	 }
       }
@@ -1241,6 +1259,8 @@ run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
    VB->ClipPtr->count = VB->Count;
 
+   outputs = program->OutputsWritten;
+
    if (outputs & (1<<VERT_RESULT_COL0)) {
       VB->ColorPtr[0] = &m->attribs[VERT_RESULT_COL0];
       VB->AttribPtr[VERT_ATTRIB_COLOR0] = VB->ColorPtr[0];
@@ -1303,14 +1323,13 @@ validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
    struct vertex_program *program = 
       (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
 
-#if TNL_FIXED_FUNCTION_PROGRAM
-   if (!program) {
+   if (!program && ctx->_MaintainTnlProgram) {
       program = ctx->_TnlProgram;
    }
-#endif
 
    if (program) {
-      compile_vertex_program( m, program );
+      if (!program->TnlData)
+	 compile_vertex_program( program, m->try_codegen );
       
       /* Grab the state GL state and put into registers:
        */
@@ -1354,8 +1373,6 @@ static GLboolean init_vertex_program( GLcontext *ctx,
    if (_mesa_getenv("MESA_EXPERIMENTAL"))
       m->try_codegen = 1;
 
-   _mesa_printf("try_codegen %d\n", m->try_codegen);
-
    /* Allocate arrays of vertex output values */
    for (i = 0; i < VERT_RESULT_MAX; i++) {
       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
@@ -1366,11 +1383,8 @@ static GLboolean init_vertex_program( GLcontext *ctx,
    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
 
-
-#if TNL_FIXED_FUNCTION_PROGRAM
-   _mesa_allow_light_in_model( ctx, GL_FALSE );
-#endif
-
+   if (ctx->_MaintainTnlProgram)
+      _mesa_allow_light_in_model( ctx, GL_FALSE );
 
    return GL_TRUE;
 }
diff --git a/src/mesa/tnl/t_vb_arbprogram.h b/src/mesa/tnl/t_vb_arbprogram.h
index fd2f09f1da6..6279f098f6f 100644
--- a/src/mesa/tnl/t_vb_arbprogram.h
+++ b/src/mesa/tnl/t_vb_arbprogram.h
@@ -55,11 +55,11 @@
 #define REG_OUT0   17
 #define REG_OUT14  31
 #define REG_IN0    32
-#define REG_IN15   47
-#define REG_ID     48		/* 0,0,0,1 */
-#define REG_ONES   49		/* 1,1,1,1 */
-#define REG_SWZ    50		/* -1,1,0,0 */
-#define REG_NEG    51		/* -1,-1,-1,-1 */
+#define REG_IN31   63
+#define REG_ID     64		/* 0,0,0,1 */
+#define REG_ONES   65		/* 1,1,1,1 */
+#define REG_SWZ    66		/* -1,1,0,0 */
+#define REG_NEG    67		/* -1,-1,-1,-1 */
 #define REG_UNDEF  127		/* special case - never used */
 #define REG_MAX    128
 #define REG_INVALID ~0
@@ -122,6 +122,8 @@ struct output {
    GLfloat *data;
 };
 
+
+
 /*--------------------------------------------------------------------------- */
 
 /*!
@@ -129,18 +131,13 @@ struct output {
  */
 struct arb_vp_machine {
    GLfloat (*File[4])[4];	/* All values referencable from the program. */
-   GLint AddressReg;
 
-   struct input input[16];
+   struct input input[_TNL_ATTRIB_MAX];
    GLuint nr_inputs;
 
    struct output output[15];
    GLuint nr_outputs;
 
-   union instruction store[1024];
-   union instruction *instructions;
-   GLint nr_instructions;
-
    GLvector4f attribs[VERT_RESULT_MAX]; /**< result vectors. */
    GLvector4f ndcCoords;              /**< normalized device coords */
    GLubyte *clipmask;                 /**< clip flags */
@@ -148,14 +145,23 @@ struct arb_vp_machine {
 
    GLuint vtx_nr;		/**< loop counter */
 
-   void (*func)( struct arb_vp_machine * ); /**< codegen'd program? */
-
    struct vertex_buffer *VB;
    GLcontext *ctx;
 
    GLboolean try_codegen;
 };
 
+struct tnl_compiled_program {   /* compiled form of a vertex program, stored in vertex_program.TnlData */
+   union instruction instructions[1024];   /* compile_vertex_program emits into this fixed-size array */
+   GLint nr_instructions;                  /* number of entries of instructions[] in use */
+   void (*compiled_func)( struct arb_vp_machine * ); /**< codegen'd program; when NULL, instructions[] is interpreted instead */
+};
+
+void _tnl_program_string_change( struct vertex_program * );  /* NOTE(review): no definition appears in this patch - confirm one exists */
+void _tnl_program_destroy( struct vertex_program * );        /* NOTE(review): no definition appears in this patch - confirm one exists */
+
 void _tnl_disassem_vba_insn( union instruction op );
 
+GLboolean _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p);  /* GL_TRUE if p->compiled_func was generated, else GL_FALSE */
+
 #endif
diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c
index 83be1d2d9f9..b6ffdda7d3a 100644
--- a/src/mesa/tnl/t_vb_arbprogram_sse.c
+++ b/src/mesa/tnl/t_vb_arbprogram_sse.c
@@ -76,8 +76,7 @@ do {									\
 
 struct compilation {
    struct x86_function func;
-   struct arb_vp_machine *m;
-
+   struct tnl_compiled_program *p;   
    GLuint insn_counter;
 
    struct {
@@ -788,6 +787,7 @@ static GLint get_offset( const void *a, const void *b )
 
 static GLboolean build_vertex_program( struct compilation *cp )
 {
+   struct arb_vp_machine *m = NULL;
    GLuint j;
 
    struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
@@ -796,11 +796,11 @@ static GLboolean build_vertex_program( struct compilation *cp )
    x86_mov(&cp->func, regEAX, x86_fn_arg(&cp->func, 1));
    x86_mov(&cp->func, parmECX, regEAX);
    
-   x86_mov(&cp->func, regEAX, x86_make_disp(regEAX, get_offset(cp->m, cp->m->File + FILE_REG)));
-   x86_mov(&cp->func, parmECX, x86_make_disp(parmECX, get_offset(cp->m, cp->m->File + FILE_STATE_PARAM)));
+   x86_mov(&cp->func, regEAX, x86_make_disp(regEAX, get_offset(m, m->File + FILE_REG)));
+   x86_mov(&cp->func, parmECX, x86_make_disp(parmECX, get_offset(m, m->File + FILE_STATE_PARAM)));
 
-   for (j = 0; j < cp->m->nr_instructions; j++) {
-      union instruction inst = cp->m->instructions[j];	 
+   for (j = 0; j < cp->p->nr_instructions; j++) {
+      union instruction inst = cp->p->instructions[j];	 
       cp->insn_counter = j+1;	/* avoid zero */
       
       if (DISASSEM) {
@@ -842,27 +842,30 @@ static GLboolean build_vertex_program( struct compilation *cp )
  * struct arb_vertex_machine.
  */
 GLboolean
-_tnl_sse_codegen_vertex_program(struct arb_vp_machine *m)
+_tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p) /* builds native code from p->instructions into p->compiled_func */
 {
    struct compilation cp;
    
    memset(&cp, 0, sizeof(cp));
-   cp.m = m;
+   cp.p = p;
    cp.have_sse2 = 1;
 
-   if (m->func) {
-      free((void *)m->func);
-      m->func = NULL;
+   if (p->compiled_func) {
+      free((void *)p->compiled_func);   /* discard any previously generated code */
+      p->compiled_func = NULL;          /* stays NULL if the build below fails */
    }
 
    x86_init_func(&cp.func);
 
+   /* Note that ctx state is not referenced while building the function,
+    * so the result depends only on the list of instructions:
+    */
    if (!build_vertex_program(&cp)) {
       x86_release_func( &cp.func );
       return GL_FALSE;
    }
 
-   m->func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
+   p->compiled_func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
    return GL_TRUE;
 }
 
@@ -871,7 +874,7 @@ _tnl_sse_codegen_vertex_program(struct arb_vp_machine *m)
 #else
 
 GLboolean
-_tnl_sse_codegen_vertex_program( GLcontext *ctx )
+_tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
 {
    /* Dummy version for when USE_SSE_ASM not defined */
    return GL_FALSE;
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index 9cf5df7cae5..d77f5424c14 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -80,7 +80,8 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
    struct vertex_program *program = ctx->VertexProgram.Current;
    GLuint i;
 
-   if (!ctx->VertexProgram._Enabled)
+   if (!ctx->VertexProgram._Enabled ||
+       !program->IsNVProgram)
       return GL_TRUE;
 
    /* load program parameter registers (they're read-only) */
diff --git a/src/mesa/tnl/t_vp_build.h b/src/mesa/tnl/t_vp_build.h
index 39c0348dd7a..8a669767550 100644
--- a/src/mesa/tnl/t_vp_build.h
+++ b/src/mesa/tnl/t_vp_build.h
@@ -28,10 +28,6 @@
 
 #include "mtypes.h"
 
-/* Define to 1 to test fixed-function execution via vertex programs:
- */
-#define TNL_FIXED_FUNCTION_PROGRAM 0
-
 extern void _tnl_UpdateFixedFunctionProgram( GLcontext *ctx );
 
 #endif